Wednesday, May 9, 2018

SAS: My first SAS program using regular expressions

SAS: My first SAS program using regular expressions



* Variables written to the parsed output data set (used as a KEEP= list) ;
%let VARS = fullpath filename extension filetype instrument date time;



* Perl regular expressions used to locate date, time and instrument id text inside a file path. ;
* Each value is the bare pattern only. The consuming step adds the slash delimiters before calling PRXPARSE. ;

* The month abbreviation is restricted to letters so that digit runs such as 123-45-6789 are not taken for a date. ;
* A match is always 10 characters (first form) or 11 characters (second form). ;
%let DATE_REGEX=((\d{4})-(\d{2})-(\d{2}))|([A-Za-z]{3}-\d\d-\d\d\d\d); * Example: 2018-05-09 or Jan-29-2017;

%let TIME_REGEX=(\d{2})_(\d{2})_(\d{2});                                    * Example: 10_30_58 ;

%let DATETIME_REGEX = (20\d\d\d\d\d\d-\d\d\d\d\d\d);                  * Example: 20180509-103058 ;

* A match is either 10 characters (first form) or 6 characters including the leading slash and the trailing delimiter (second form). ;
%let INSTRUMENT_REGEX=(\w\d\d\d\d\w\d\d\d\d)|(\\(\d{4})[\\\s_-]); * Example: D1234E5678 or \1024_ ends with slash blank underscore or hyphen ;



/*----------------------------------------------------------------------
  Build WORK.FILES_PARSED from WORK.FILES.

  Input : WORK.FILES, read for the variables MEMNAME (a file path) and
          ISFOLDER (1 marks a folder row).
  Output: one row per accepted file, keeping only the variables listed
          in the VARS macro variable: fullpath, filename, extension,
          filetype, instrument, date, time.

  Requires the macro variables VARS, DATE_REGEX, TIME_REGEX,
  DATETIME_REGEX, INSTRUMENT_REGEX, GLBL_DATE_FORMAT and
  GLBL_TIME_FORMAT to be defined before this step runs.

  Rows are dropped (with a message in the log for most cases) when they
  are folders, match one of the excluded file names, have no backslash,
  no extension, no recognisable file type, no date or time (except
  MPU_EVENT_LOG-EVT.dat), or no instrument id.
----------------------------------------------------------------------*/
data work.files_parsed (keep=&VARS);

    attrib fullpath   length=$256;
    attrib filename   length=$ 64;
    attrib extension  length=$  8;
    attrib filetype   length=$  6;
    attrib instrument length=$ 10;
    attrib date       length=   8  format=&GLBL_DATE_FORMAT;
    attrib time       length=   8  format=&GLBL_TIME_FORMAT;

    * ids of the compiled regex patterns, retained so each pattern is compiled only once ;
    retain date_pattern       0;     * id of compiled regex pattern for DATE_REGEX ;
    retain time_pattern       0;     * id of compiled regex pattern for TIME_REGEX ;
    retain datetime_pattern   0;     * id of compiled regex pattern for DATETIME_REGEX ;
    retain instrument_pattern 0;     * id of compiled regex pattern for INSTRUMENT_REGEX ;

    set work.files;

    if (_N_ = 1) then do;
        date_pattern = prxparse("/&DATE_REGEX./");              * compile regex ;
        put date_pattern=;
        time_pattern = prxparse("/&TIME_REGEX./");              * compile regex ;
        put time_pattern=;
        datetime_pattern = prxparse("/&DATETIME_REGEX./");      * compile regex ;
        put datetime_pattern=;
        instrument_pattern = prxparse("/&INSTRUMENT_REGEX./");  * compile regex ;
        put instrument_pattern=;
    end;

    if (isFolder = 1) then delete;

    if (index(memname, "sfscli.log")             > 0) then delete;         * has no relevant data ;
    if (index(memname, ".xlsx")                  > 0) then delete;         * has no relevant data ;
    if (index(memname, "System Information.txt") > 0) then delete;         * has no relevant data ;
    if (index(memname, ".pdf")                   > 0) then delete;         * not going there... ;

    * normalise path separators to backslash and remember the whole path ;
    memname = tranwrd(memname, "/", "\");
    fullpath = memname;

    * file name is everything after the last backslash (negative start searches right to left) ;
    slash = find(memname, "\", -999);
    if (slash > 0) then do;
        * SUBSTRN tolerates a start position past the end of the value ;
        filename = substrn(memname, slash + 1);
    end;
    else do;
        put " Record " _N_ " has no slash.";
        delete;
    end;

    memname = filename;  * less to work with going forward... ;

    * split off the extension at the last period ;
    period = find(memname, ".", -999);
    if (period > 0) then do;
        * SUBSTRN returns an empty value instead of failing when the period is the first or last character ;
        extension = upcase(substrn(memname, period + 1));
        memname = substrn(memname, 1, period - 1);
    end;
    else do;
        put " Record " _N_ " has no extension.";
        delete;
    end;

    if (index(filename, "ARC 4") > 0) then do;  * special case...one person ;
        filetype = "SNP";
    end;
    else do;
        * the file type is the last three characters of the base name, so shorter names cannot carry one ;
        if (length(memname) < 3) then do;
            put " Record " _N_ " does not appear to have a recognized filetype. " fullpath=;
            delete;
        end;

        filetype = upcase(substr(memname, length(memname) - 2));  * expect DNP, SNP, EVT, CSV ;

        if (first(filetype) >= "0" and first(filetype) <= "9") then do;
            put " Record " _N_ " does not appear to have a recognized filetype. " fullpath=;
            delete;
        end;
        else do;
            * strip the file type. SUBSTRN allows the remaining length to be zero ;
            memname = substrn(memname, 1, length(memname) - 3);
        end;
    end;

    * must check for date and time, separately, on file name, before   ;
    * attempting to check for date and time, combined, on folder name. ;

    * The double question mark modifier on INPUT keeps text that matches the pattern but is ;
    * not a valid date or time from writing invalid data notes or setting the error flag.   ;
    * The value simply stays missing and is handled by the checks further down.             ;

    date = . ;
    call prxsubstr(date_pattern, memname, match_pos, match_len);  * match_pos is 0 when nothing matched ;
    if (match_pos > 0) then do;
        if (match_len = 10) then do;
            date = input(substr(memname, match_pos, match_len), ?? yymmdd10.);
        end;
        else if (match_len = 11) then do;
            date = input(substr(memname, match_pos, match_len), ?? anydtdte11.);
        end;
    end;

    time = . ;
    call prxsubstr(time_pattern, memname, match_pos, match_len);
    if (match_pos > 0) then do;
        time = input(substr(memname, match_pos, match_len), ?? time8.);
    end;

    * fall back to a combined date-time stamp anywhere in the full path ;
    if (date = . and time = .) then do;
        call prxsubstr(datetime_pattern, fullpath, match_pos, match_len);
        if (match_pos > 0) then do;
            * first 8 characters are the date, then a hyphen, then 6 characters of time ;
            date = input(substr(fullpath, match_pos, 8), ?? yymmdd8.);
            time = input(substr(fullpath, match_pos + 9, 6), ?? hhmmss6.);
        end;
    end;

    if (upcase(filename) = upcase("MPU_EVENT_LOG-EVT.dat")) then do;
        * date and time are not required for these files ;
    end;
    else do;
        if (date = .) then do;
            put " Record " _N_ " has no date. " fullpath=;
            delete;
        end;
        else if (time = .) then do;
            put " Record " _N_ " has no time. " fullpath=;
            delete;
        end;
    end;

    call prxsubstr(instrument_pattern, fullpath, match_pos, match_len);
    if (match_pos > 0) then do;
        if (match_len = 6) then do;
            instrument = substr(fullpath, match_pos + 1, match_len - 2);  * remove leading slash, trailing character ;
        end;
        else do;
            instrument = substr(fullpath, match_pos, match_len);
        end;
    end;
    else do;
        put " Record " _N_ " has no instrument. " fullpath=;
        delete;
    end;

run;









No comments:

Post a Comment