Wednesday, May 9, 2018

SAS: My first SAS program using regular expressions

SAS: My first SAS program using regular expressions



/* Names of the variables written to WORK.FILES_PARSED (used in its keep= option). */
%let VARS = fullpath filename extension filetype instrument date time;

/* Date inside a file name. Example: 2018-05-09 or Jan-29-2017 */
%let DATE_REGEX=((\d{4})-(\d{2})-(\d{2}))|(\w\w\w-\d\d-\d\d\d\d);

/* Time inside a file name. Example: 10_30_58 */
%let TIME_REGEX=(\d{2})_(\d{2})_(\d{2});

/* Combined date and time inside a folder name. Example: 20180509-103058 */
%let DATETIME_REGEX = (20\d\d\d\d\d\d-\d\d\d\d\d\d);

/* Instrument id. Example: D1234E5678, or \1024_ where the four digits follow */
/* a backslash and are followed by a backslash, blank, underscore or hyphen.  */
%let INSTRUMENT_REGEX=(\w\d\d\d\d\w\d\d\d\d)|(\\(\d{4})[\\\s_-]);



/*
 * Parse each path in WORK.FILES into WORK.FILES_PARSED.
 *
 * Reads:   WORK.FILES (uses the variables memname and isFolder).
 * Writes:  WORK.FILES_PARSED, keeping only the variables named in the VARS
 *          macro variable.
 * Needs:   macro variables VARS, DATE_REGEX, TIME_REGEX, DATETIME_REGEX,
 *          INSTRUMENT_REGEX, GLBL_DATE_FORMAT and GLBL_TIME_FORMAT.
 *
 * For every row the step derives the file name, extension, file type,
 * date, time and instrument id. Rows that are folders, are known to be
 * irrelevant, or are missing a required piece are deleted, with a message
 * written to the log for the "missing piece" cases.
 */
data work.files_parsed (keep=&VARS);

    attrib fullpath   length=$256;
    attrib filename   length=$ 64;
    attrib extension  length=$  8;
    attrib filetype   length=$  6;
    attrib instrument length=$ 10;
    attrib date       length=   8  format=&GLBL_DATE_FORMAT;
    attrib time       length=   8  format=&GLBL_TIME_FORMAT;

    /* Ids of the compiled regex patterns. Retained so each pattern is */
    /* compiled once, on the first iteration, and reused for every row. */
    retain date_pattern       0;
    retain time_pattern       0;
    retain datetime_pattern   0;
    retain instrument_pattern 0;

    set work.files;

    if (_N_ = 1) then do;
        date_pattern = prxparse("/&DATE_REGEX./");
        put date_pattern=;
        time_pattern = prxparse("/&TIME_REGEX./");
        put time_pattern=;
        datetime_pattern = prxparse("/&DATETIME_REGEX./");
        put datetime_pattern=;
        instrument_pattern = prxparse("/&INSTRUMENT_REGEX./");
        put instrument_pattern=;
    end;

    if (isFolder = 1) then delete;

    if (index(memname, "sfscli.log")             > 0) then delete;  /* has no relevant data */
    if (index(memname, ".xlsx")                  > 0) then delete;  /* has no relevant data */
    if (index(memname, "System Information.txt") > 0) then delete;  /* has no relevant data */
    if (index(memname, ".pdf")                   > 0) then delete;  /* not going there...   */

    /* Normalize to backslashes and remember the complete path. */
    memname = tranwrd(memname, "/", "\");
    fullpath = memname;

    /* File name = everything after the last backslash (negative start = search leftward). */
    slash = find(memname, "\", -999);
    if (slash > 0) then do;
        filename = substr(memname, slash+1);
    end;
    else do;
        put " Record " _N_ " has no slash.";
        delete;
    end;

    memname = filename;  /* less to work with going forward... */

    /* Extension = everything after the last period. A period in column 1 */
    /* leaves no base name (SUBSTR would be asked for zero characters), so */
    /* such a name is handled like one without an extension.               */
    period = find(memname, ".", -999);
    if (period > 1) then do;
        extension = upcase(substr(memname, period+1));
        memname = substr(memname, 1, period - 1);
    end;
    else do;
        put " Record " _N_ " has no extension.";
        delete;
    end;

    if (index(filename, "ARC 4") > 0) then do;  /* special case...one person */
        filetype = "SNP";
    end;
    else do;
        /* File type = last three characters of the base name (expect DNP, SNP, EVT, CSV). */
        /* Names shorter than three characters cannot carry one; checking first keeps      */
        /* SUBSTR from being called with a start position below 1.                         */
        name_len = length(memname);
        if (name_len >= 3) then filetype = upcase(substr(memname, name_len - 2));
        if (name_len < 3 or (first(filetype) >= "0" and first(filetype) <= "9")) then do;
            put " Record " _N_ " does not appear to have a recognized filetype. " fullpath=;
            delete;
        end;
        /* Strip the file type; when nothing else remains, blank the name */
        /* instead of requesting a zero-length substring.                 */
        if (name_len > 3) then memname = substr(memname, 1, name_len - 3);
        else memname = " ";
    end;

    /* Must check for date and time, separately, on file name, before   */
    /* attempting to check for date and time, combined, on folder name. */

    /* CALL PRXSUBSTR returns position 0 when nothing matches, so no    */
    /* separate PRXMATCH call is needed.                                */
    call prxsubstr(date_pattern, memname, match_pos, match_len);
    if (match_pos > 0) then do;
        /* 10 characters = yyyy-mm-dd form, 11 characters = Mon-dd-yyyy form. */
        if (match_len = 10) then do;
            date = input(substr(memname, match_pos, match_len), yymmdd10.);
        end;
        else if (match_len = 11) then do;
            date = input(substr(memname, match_pos, match_len), anydtdte11.);
        end;
    end;
    else do;
        date = . ;
    end;

    call prxsubstr(time_pattern, memname, match_pos, match_len);
    if (match_pos > 0) then do;
        /* NOTE(review): the matched text is separated by underscores (10_30_58); */
        /* confirm TIME8. accepts that separator.                                 */
        time = input(substr(memname, match_pos, match_len), time8.);
    end;
    else do;
        time = . ;
    end;

    /* Fall back to a combined yyyymmdd-hhmmss stamp anywhere in the full path. */
    if (date = . and time = .) then do;
        call prxsubstr(datetime_pattern, fullpath, match_pos, match_len);
        if (match_pos > 0) then do;
            date = input(substr(fullpath, match_pos, 8), yymmdd8.);
            time = input(substr(fullpath, match_pos + 9, 6), hhmmss6.);  /* skip the hyphen */
        end;
    end;

    /* Date and time are not required for MPU_EVENT_LOG-EVT.dat files. */
    if (upcase(filename) ne upcase("MPU_EVENT_LOG-EVT.dat")) then do;
        if (date = .) then do;
            put " Record " _N_ " has no date. " fullpath=;
            delete;
        end;
        else if (time = .) then do;
            put " Record " _N_ " has no time. " fullpath=;
            delete;
        end;
    end;

    call prxsubstr(instrument_pattern, fullpath, match_pos, match_len);
    if (match_pos > 0) then do;
        if (match_len = 6) then do;
            /* Four-digit form: remove leading slash, trailing character. */
            instrument = substr(fullpath, match_pos + 1, match_len - 2);
        end;
        else do;
            instrument = substr(fullpath, match_pos, match_len);
        end;
    end;
    else do;
        put " Record " _N_ " has no instrument. " fullpath=;
        delete;
    end;

run;









Monday, May 7, 2018

R: Scope of variables within user defined functions



R: Scope of variables within user defined functions


Source: https://stackoverflow.com/questions/10904124/global-and-local-variables-in-r

Variables declared inside a function are local to that function. For instance:

# foo() assigns bar with `<-`, so bar exists only while foo() is running.
foo <- function() {
   bar <- 1
}
foo()
bar  # fails: bar was local to foo()

gives the following error: Error: object 'bar' not found.


If you want to make bar a global variable, you should do:

# Same function, but bar is assigned with `<<-` so it can be read after foo() returns.
foo <- function() {
    bar <<- 1
}
foo()
bar  # works: bar is accessible outside foo()

In this case bar is accessible from outside the function.



However, unlike C, C++ or many other languages, brackets do not determine the scope of variables. For instance, in the following code snippet:

# y is assigned inside the braces but remains accessible after the statement.
# Assumes x is already defined. `else` must sit on the same line as the
# closing brace: at top level R treats the `if` as complete at `}` and then
# reports a syntax error on a line that starts with `else`.
if (x > 10) {
    y <- 0
} else {
    y <- 1
}

y remains accessible after the if-else statement.



PowerShell: Enable execution of PS scripts on the local machine




PowerShell: Enable execution of PS scripts on the local machine


Open command window with Run As Administrator

Type powershell and press Enter.

# List the execution policy for every scope, set the policy to Bypass, then list again to confirm.
Get-ExecutionPolicy -List
Set-ExecutionPolicy Bypass
Get-ExecutionPolicy -List  # you will see LocalMachine is now Bypass

# Leave the PowerShell session.
exit


(Set-ExecutionPolicy also accepts a -Scope parameter to target a scope other than LocalMachine.)

R: ts function gotchas...




R: ts function gotchas...



> # It appears R is replicating data when creating a time series where

> # requested date range exceeds available data.

>

> # The moral of the story is make sure there are no gaps in data

> # prior to creating the time series!!!

>

> df = data.frame()

>

> x = 10; yr = 2016; mo = 01; df = rbind(df, data.frame(x, yr, mo))

> x = 12; yr = 2016; mo = 02; df = rbind(df, data.frame(x, yr, mo))

> x = 15; yr = 2016; mo = 03; df = rbind(df, data.frame(x, yr, mo))

> x = 19; yr = 2016; mo = 04; df = rbind(df, data.frame(x, yr, mo))

>

> x = 11; yr = 2016; mo = 05; df = rbind(df, data.frame(x, yr, mo))

> x = 13; yr = 2016; mo = 06; df = rbind(df, data.frame(x, yr, mo))

> x = 16; yr = 2016; mo = 07; df = rbind(df, data.frame(x, yr, mo))

> x = 20; yr = 2016; mo = 08; df = rbind(df, data.frame(x, yr, mo))

>

> x = 12; yr = 2016; mo = 09; df = rbind(df, data.frame(x, yr, mo))

> x = 14; yr = 2016; mo = 10; df = rbind(df, data.frame(x, yr, mo))

> x = 17; yr = 2016; mo = 11; df = rbind(df, data.frame(x, yr, mo))

> x = 21; yr = 2016; mo = 12; df = rbind(df, data.frame(x, yr, mo))

>

> x = 13; yr = 2017; mo = 01; df = rbind(df, data.frame(x, yr, mo))

> x = 15; yr = 2017; mo = 02; df = rbind(df, data.frame(x, yr, mo))

> x = 18; yr = 2017; mo = 03; df = rbind(df, data.frame(x, yr, mo))

> x = 22; yr = 2017; mo = 04; df = rbind(df, data.frame(x, yr, mo))

>

> print(df)

    x   yr mo

1  10 2016  1

2  12 2016  2

3  15 2016  3

4  19 2016  4

5  11 2016  5

6  13 2016  6

7  16 2016  7

8  20 2016  8

9  12 2016  9

10 14 2016 10

11 17 2016 11

12 21 2016 12

13 13 2017  1

14 15 2017  2

15 18 2017  3

16 22 2017  4

>

> # get first and last month and year

> min_yr = min(df$yr)

> min_mo= min(subset(df, yr==min_yr)$mo)

> max_yr = max(df$yr)

> max_mo= max(subset(df, yr==max_yr)$mo)

>

> print(paste0("Have values from ", min_mo, "/", min_yr, " thru ", max_mo, "/", max_yr))

[1] "Have values from 1/2016 thru 4/2017"

>

> ts = ts(df$x, start=c(min_yr, min_mo), end=c(max_yr, max_mo), frequency=12)

> print(ts)

     Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

2016  10  12  15  19  11  13  16  20  12  14  17  21

2017  13  15  18  22                               

>

> max_mo = 8  # data does not really go out that far

> ts2 = ts(df$x, start=c(min_yr, min_mo), end=c(max_yr, max_mo), frequency=12)

> print(ts2)

     Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

2016  10  12  15  19  11  13  16  20  12  14  17  21

2017  13  15  18  22  10  12  15  19               

> # N O T E: I do not have data for May-Aug 2017 so it reuses the Jan-Apr 2016 values ! ! !

>

> # drop one row, Oct 2016

> df = subset(df, !(yr == 2016 & mo == 10))

> print(df)

    x   yr mo

1  10 2016  1

2  12 2016  2

3  15 2016  3

4  19 2016  4

5  11 2016  5

6  13 2016  6

7  16 2016  7

8  20 2016  8

9  12 2016  9

11 17 2016 11

12 21 2016 12

13 13 2017  1

14 15 2017  2

15 18 2017  3

16 22 2017  4

> max_mo = 4  # undo prior test

> ts3 = ts(df$x, start=c(min_yr, min_mo), end=c(max_yr, max_mo), frequency=12)

> print(ts3)

     Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

2016  10  12  15  19  11  13  16  20  12  17  21  13

2017  15  18  22  10                               

> # N O T E: This shows df dates are ignored when creating time series ! ! !