/************************************************************************************************************;
This program is to assist the NAACCR community in reading and writing NAACCR XML files using SAS.
This program is to be used in conjunction with the Word document, 
Instructions for ReadWrite_NAACCR_23_XML_tidy.sas_20230829.docx.

This version works with naaccr-xml-utility-10.1. Changes from prior versions are noted in:
https://github.com/imsweb/naaccr-xml/releases

This program harnesses SAS code and macros written by Fabian Depry, IMS, adds SAS labels, and removes fields from the SAS
datasets that are 100% missing (blank).
************************************************************************************************************/;


*** STEP 1. Specify the folder location that SAS will use to write HTML output files. ***;
/* Configure the ODS HTML destination. PATH= names the folder for the HTML files and GPATH= the folder  */
/* for graphics files; both point to the same output folder here. The (URL=none) suboption is applied   */
/* to GPATH= only.                                                                                      */
ods html path="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER" 
        gpath="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER"(URL=none);
/* NEWFILE=PROC requests a new HTML body file for each procedure that produces output. */
ods html newfile=proc; 


*** STEP 2. Use Fabian's tools to Read an XML file into SAS.
*** Specify the location of the read_naaccr_xml_macro.sas, the path for the SAS jar file, the source XML data file         ***; 
*** (this can read GZIP or unzipped XML files), and the CSV version of the user-defined data dictionary (if there is one). ***;

/************************************************************************************************************;
    Original Comments from Fabian's "read" example:

    This program demonstrates how to include and call the "read_naaccr_xml_macro.sas" macro.
    This example assumes that the JAR file [Java ARchive] is in the same folder as this program.
    Make sure that the naaccrVersion and recordType are correct or some data items won't be correctly populated.

    Commented out in the code below, this example includes two items, meaning that only those two items will be included in the resulting
    data set. That parameter is optional and if not provided, the data set will contain all standard
    items plus any non-standard items provided via the extra dictionary. Be aware that creating a data set
    containing all items will be MUCH slower than creating one for just a few items, and so if you only need
    a handful of items to do your analysis, it is strongly recommended to provide those items (you can
    check the official NAACCR documentation (http://datadictionary.naaccr.org/default.aspx?Version=22)
    to find the NAACCR XML IDs to use in that list).
    The new parameter added in 8.6, cleanupcsv, can be set to "no" to aid in debugging. The default value is "yes." 
    04/04/2022 - Fabian Depry - Added new option groupeditems parameter to allow grouped items to be added to the data set.

    This example references an extra user-defined dictionary that defines non-standard NAACCR data items. If
    your data file only contains standard data items, that dictionary is not needed. Otherwise the dictionary
    should have been provided by the organization that created the XML data file. Dictionaries are usually
    in XML format, but for technical reasons, the macro expects them in CSV files; the NAACCR XML Tool that
    is distributed with the macros has an option to load a dictionary and save it as CSV. The Word document
    describes how to do this in detail.
    - dictfile is the path to an optional user-defined dictionary in CSV format (the NAACCR XML Tool that
        is distributed with the macros has an option to load an XML dictionary and save it as CSV;
        File*Pro can also generate those files); use spaces to separate multiple paths if you need to
        provide more than one dictionary.

   Note that the items listed below are those needed to perform the person-level and tumor-level deduplications
   using the new NAACCR Match*Pro protocol. A Type C file is required for many of these fields.

 ************************************************************************************************************/;

/* Load the definition of the readNaaccrXml macro from the NAACCR XML utility folder. */
%include "R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\naaccr-xml-utility-10.1\sas\read_naaccr_xml_macro.sas";
/* Read the gzipped SEER transmission file (sourcefile) into the WORK data set "fromxml", using the      */
/* jar file given in libpath and the user-defined dictionary CSV given in dictfile.                      */
/* The items= parameter is left commented out, so no item subset is requested; per the notes above,     */
/* the data set then contains all standard items plus the non-standard items from the dictionary.       */
/* To read only a few items, move the items= lines out of the comment and edit the list.                */
%readNaaccrXml(
  libpath="R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\naaccr-xml-utility-10.1\sas\naaccr-xml-10.1-sas.jar",
  sourcefile="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\ID.ext02.20230824141149.seer-transmission-nov2023.xml.gz",
  naaccrversion="230", 
  recordtype="I",
  dataset=fromxml,
/* items="patientIdNumber,nameFirst,nameLast,nameMaiden,nameMiddle,nameBirthSurname,dateOfBirth,socialSecurityNumber,telephone,sex,
  addrCurrentNoStreet,addrCurrentCity,addrCurrentPostalCode,addrCurrentState,addrAtDxNoStreet,addrAtDxCity,addrAtDxPostalCode,addrAtDxState, 
  tumorRecordNumber,sequenceNumberCentral,primarySite,laterality,histologyIcdO2,behaviorIcdO2,histologicTypeIcdO3,behaviorCodeIcdO3,
  dateOfDiagnosis,ageAtDiagnosis,typeOfReportingSource,overRideSiteLatSeqno", */ 
  dictfile="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\seer-transmission-nov2023-dictionary.csv", 
  cleanuptempfiles="yes",
  groupeditems="no"
);
/* List the variables of fromxml; the POSITION option adds a listing in data set position order. */
proc contents data=fromxml position;
run;


*** STEP 3. Create SAS variable labels.                                                                                                ***;
***  a. In the 'filename dicts...' below, put the path and name of the NAACCR 23 base dictionary CSV on the first line. If this was    ***;
***     not provided to you, use Step 4 of the Word document instructions to convert the base dictionary from XML format to CSV.       ***;
***  b. In the 'filename dicts...' below, put the path and name of the user-defined dictionary CSV that you extracted in Step 4        ***;
***     of the Word document instructions on the second line. If you do not have a user-defined dictionary, leave this blank.          ***;
***  c. In the 'filename labels...' below, put the path where you want to store a text file of SAS labels statements. This will be     ***;
***     used in Step 4 to label the SAS variables. Leave the filename as-is, "label-statements.txt".                                   ***;
/* Dictionary CSV files, read as one concatenated input stream: the NAACCR 23 base dictionary first, */
/* then the user-defined dictionary.                                                                  */
filename dicts ("R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\base-naaccr-dictionary-230.csv"
                "R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\seer-transmission-nov2023-dictionary.csv");
/* Text file that receives one generated LABEL statement per dictionary item (included in Step 4). */
filename labels "R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\label-statements.txt";

/* Read the dictionary rows and build, for each item, a statement of the form                         */
/*    label <xml id> = '<number>_<name> - <xml id> - <parent element>';                               */
/* The statements are kept in WORK.DICT2LABELS and written to the LABELS file.                        */
data dict2labels;
   infile dicts delimiter = ',' MISSOVER DSD lrecl=32767 firstobs=2;
   /* The FORMAT statement is the first reference to each variable, so it also fixes the type:       */
   /* $w. formats create character variables, best12. creates numeric variables.                     */
   format NAACCR_XML_ID      $32. 
          NAACCR_Number      best12. 
          Name               $100. 
          Start_Column       best12.
          Length             best12. 
          Record_Types       $20. 
          Parent_XML_Element $20.
          Data_Type          $20. 
          labelstatement     $200.;
   /* Start_Column is numeric (best12. above), so it is read without the $ modifier; reading it      */
   /* with $ contradicted the numeric declaration.                                                    */
   input
      NAACCR_XML_ID      $
      NAACCR_Number
      Name               $
      Start_Column
      Length
      Record_Types       $
      Parent_XML_Element $
      Data_Type          $;
   /* FIRSTOBS=2 is given once for the whole input, so a header row can still arrive as data (for    */
   /* example the header of the second dictionary file). Discard it before building a label.         */
   if NAACCR_XML_ID='NAACCR XML ID' then delete;
   /* CATS converts the numeric item number to text explicitly (no implicit-conversion note).        */
   /* Apostrophes in the item name are doubled so that the single-quoted label text stays valid      */
   /* when the statement is later %INCLUDEd.                                                          */
   labelstatement = "label "||strip(NAACCR_XML_ID)||" = '"||cats(NAACCR_Number)||"_"
                    ||tranwrd(strip(Name), "'", "''")
                    ||" - "||strip(NAACCR_XML_ID)||" - "||strip(Parent_XML_Element)||"';";
   file labels;
   put labelstatement;
run;
/* Show the generated LABEL statements; the N option prints the number of observations. */
proc print data=dict2labels noobs n;
   var labelstatement;
run;


*** STEP 4. This is the TIDY part of the code that labels the SAS variables and drops variables that are 100% missing. If you looked         ***;
*** at the proc contents results from Step 2, you may have noticed that every single variable in the NAACCR V23 data dictionary is included, ***;
*** even though some of the variables were not included in the data file.                                                                    ***; 

*** The SAS Log will show notes for the variables not in the data file, but that you tried to label, indicating that the variable is         ***;
*** uninitialized, for example "NOTE: Variable textStaging is uninitialized." Don't worry about these notes. :)                              ***;

/* Copy fromxml to xml_labeled and attach the variable labels. The %INCLUDE pulls in the LABEL         */
/* statements stored in the file behind the LABELS fileref. It stays after the SET statement so that   */
/* the variables coming from fromxml are defined first. As noted above, labels for variables that are  */
/* not in fromxml produce "uninitialized" notes in the log.                                            */
data xml_labeled;
   set fromxml;
   %include labels;
run;


* Build NAMES with one row per variable of xml_labeled (no data rows are read) ;
proc transpose data=xml_labeled (obs=0) out=names;
   var _all_;
run;

* Write a PROC SQL step to a temporary file that counts the non-missing values of every variable ;
filename code temp;
data _null_;
   /* FILE comes before SET so the temporary file is opened even when NAMES has no rows. */
   file code;
   set names end=last_var;

   /* Opening lines before the first variable, a separating comma before every later one. */
   if _n_ = 1 then do;
      put 'proc sql noprint;';
      put 'create table _counts as select';
      put ' ' @;
   end;
   else do;
      put ',' @;
   end;

   /* One select-list entry per variable, named after the variable it counts. */
   put 'sum(not missing(' _name_ ')) as ' _name_;

   /* Closing lines after the last variable. */
   if last_var then do;
      put 'from xml_labeled';
      put ';';
      put 'quit;';
   end;
run;

* Submit the generated step and echo its source to the log ;
%include code / source2;

* Write a DROP statement that names every variable whose non-missing count is zero ;
filename code temp;
data _null_;
   set _counts;
   /* All count columns of _COUNTS; the array is declared before _name_ and uses _n_ as the loop  */
   /* index, so no helper variable can become an array element.                                    */
   array nonmiss_count _numeric_;
   file code lrecl=80;
   length _name_ $32;

   put 'drop ' @;
   do _n_ = 1 to dim(nonmiss_count);
      /* Skip variables that have at least one value. */
      if nonmiss_count(_n_) ne 0 then continue;
      _name_ = vname(nonmiss_count(_n_));
      put _name_ @;
   end;
   put ';';
run;

* Copy the labeled data and apply the generated DROP statement ;
data xml_labeled_tidy;
   set xml_labeled;
   %include code / source2;
run;

* List the variables that remain ;
proc contents data=xml_labeled_tidy position;
run;


*** STEP 5. You can now work with the SAS dataset "xml_labeled_tidy." You may wish to save it as a permanent SAS dataset (in a folder)   ***;
*** or modify the code below to review the frequencies of variables. You can modify data items, like remove the day components of dates. ***;
*** You can do statistical analysis, etc.                                                                                                ***;
*** The example below is for the Nov 2023 SEER Data Submission, which includes a subset of variables from the standard NAACCR dictionary ***;
*** and additional user-defined variables.                                                                                               ***;
*title1 'FREQUENCIES FOR NAACCR BASE DICTIONARY (STANDARD) DATA ITEMS AND NON-STANDARD DATA ITEMS';
*proc freq data=xml_labeled_tidy;
*   format patientIdNumber $2.;
*   tables _ALL_;
*run;

/* One-way frequency tables for the non-standard (user-defined) data items.                           */
/* The date items are displayed with the $6. format, i.e. only their first six characters are shown. */
/* Items inside comments are excluded from the request.                                               */
title1 'FREQUENCIES FOR NON-STANDARD DATA ITEMS';
proc freq data=xml_labeled_tidy;
   format covid19AntibodyTestDate
          covid19DecisionToDelayDate
          covid19DxDate
          covid19ViralTestDate
          oncotypeDxDcisTestReportDate
          oncotypeDxPrstReportDate
          oncotypeDxRsTestReportDate   $6.;
   tables
      /* COVID-19 items */
      covid19AntibodyTest   covid19AntibodyTestDate   covid19Bmt   covid19Brm
      covid19Changed1stCourseTx   covid19Chemo   covid19DecisionToDelayDate
      covid19DelayedCAEvents   covid19Diagnosed   covid19DxDate   covid19Hormone
      covid19Radiation   /*covid19RadiationOtherIcb*/   covid19RadiationOtherRt
      covid19SurgeryImpact   covid19ViralTest   covid19ViralTestDate
      /* Treatment timing and linkage items */
      daysToTreatment   /*decipherLinkageFlag*/   /*decisionDxLinkageFlag*/
      microMatchStatus
      /* Oncotype DX items */
      oncotypeDxDcisReasonNoScore   oncotypeDxDcisRiskGroup   oncotypeDxDcisScore
      oncotypeDxDcisTestReportDate   oncotypeDxPrstReasonNoScore   oncotypeDxPrstReportDate
      oncotypeDxPrstRiskGroup   oncotypeDxPrstRiskGroup1317   oncotypeDxPrstScore
      oncotypeDxRecurrenceScore   oncotypeDxRsReasonNoScore   oncotypeDxRsRiskGroup
      oncotypeDxRsTestReportDate
      /* Remaining items */
      penaltyCode   /*pharmacyLinkageFlag*/
   ;
run;

/* Example: keep only the records whose dateOfDiagnosis starts with 2021 and write them, sorted by */
/* state at diagnosis, patient ID and tumor record number, to the data set EXAMPLE.                */
proc sort data=xml_labeled_tidy (where=(substr(dateOfDiagnosis,1,4) = '2021'))
          out=example;
   by addrAtDxState patientIdNumber tumorRecordNumber;
run;


*** STEP 6. Use Fabian's tools to Write an XML file from SAS.                                                  ***;
*** Specify the location of the write_naaccr_xml_macro.sas, the path for the SAS jar file, the target XML file, ***; 
*** the dataset, and the CSV version of the user-defined data dictionary (the same that you used above).         ***;
/************************************************************************************************************;
    Original Comments from Fabian's "write" example:

    This program demonstrates how to include and call the "write_naaccr_xml_macro.sas" macro.
    While it's possible to use the write macro without the read one, they are really meant to be used together.
    This example assumes that the JAR file is in the same folder as this program.
    Make sure that the naaccrVersion and recordType are correct or some data items won't be correctly populated.
    This example references an extra user-defined dictionary that defines non-standard NAACCR data items. If
    your data file only contains standard data items, that dictionary is not needed. Otherwise the dictionary
    should have been provided by the organization that created the XML data file. Dictionaries are usually
    in XML files, but for technical reasons, the macro expects them in CSV format; the NAACCR XML Tool that
    is distributed with the macros has an option to load a dictionary and save it as CSV. For writing proper
    XML files, the macro also needs the dictionary URI [Uniform Resource Identifier]; since the CSV format 
    doesn't contain that URI, it needs to be provided as a parameter. The URI can be found as a root attribute 
    of the XML dictionary. Note that the dictionary URI values might look like internet addresses, but in general 
    they don't point to an existing web location. That's because URIs (Uniform Resource Identifiers) are not
    URLs (Uniform Resource Locators) but they often use the same convention: a path delimited by slashes, with the 
    beginning of the path representing an organization and each remaining part of the path representing a more
    specific part of the resource. A given dictionary URI can point to an actual web location containing the 
    dictionary, but that is not a requirement, and the URI of the standard NAACCR base dictionaries don't
    reference actual locations. Don't worry; your data are not being sent to a website.     

    03/12/2021 - Fabian Depry - Added new writenum parameter to allow NAACCR numbers to be written.
    The default is writenum="no". If you want to use the XML file in GenEdits Plus, you need to set writenum="yes".
    10/13/2021 - Fabian added new parameter, cleanupcsv, which can be set to "no" to aid in debugging. The default value is "yes." 

   Note that the items listed below are those needed to perform the person-level and tumor-level deduplications
   using the new NAACCR Match*Pro protocol. A Type C file is required for many of these fields.

 ************************************************************************************************************/;

/* Load the definition of the writeNaaccrXml macro from the NAACCR XML utility folder. */
%include "R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\naaccr-xml-utility-10.1\sas\write_naaccr_xml_macro.sas";
/* Write the data set xml_labeled_tidy to the XML file named in targetfile, using the jar file given   */
/* in libpath. dictfile is the same user-defined dictionary CSV used for reading, and dicturi supplies */
/* the dictionary URI that the CSV does not contain (see the notes above). writenum="yes" is the       */
/* setting the notes above call for when the file will be used in GenEdits Plus.                       */
/* The items= parameter is left commented out, so no item subset is requested.                         */
%writeNaaccrXml(
  libpath="R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\naaccr-xml-utility-10.1\sas\naaccr-xml-10.1-sas.jar",
  targetfile="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\your-output-naaccr-xml-data-file.xml.gz",
  naaccrversion="230", 
  recordtype="I",
  dataset=xml_labeled_tidy,
/* items="patientIdNumber,nameFirst,nameLast,nameMaiden,nameMiddle,nameBirthSurname,dateOfBirth,socialSecurityNumber,telephone,sex,
  addrCurrentNoStreet,addrCurrentCity,addrCurrentPostalCode,addrCurrentState,addrAtDxNoStreet,addrAtDxCity,addrAtDxPostalCode,addrAtDxState, 
  tumorRecordNumber,sequenceNumberCentral,primarySite,laterality,histologyIcdO2,behaviorIcdO2,histologicTypeIcdO3,behaviorCodeIcdO3,
  dateOfDiagnosis,ageAtDiagnosis,typeOfReportingSource,overRideSiteLatSeqno", */ 
  dictfile="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\seer-transmission-nov2023-dictionary.csv",
  dicturi="https://seer.cancer.gov/naaccrxml/seer-transmission-nov2023-dictionary.xml",
  writenum="yes",
  cleanuptempfiles="yes",
  grouptumors="yes"
);



*** STEP 7. FOR TESTING ONLY.
*** FOR COMPARISON OF READ AND WRITE STATEMENTS ***;
*** a. Read in the XML you just wrote. ***;
*%include "R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\naaccr-xml-utility-10.1\sas\read_naaccr_xml_macro.sas";
*%readNaaccrXml(
  libpath="R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V23\naaccr-xml-utility-10.1\sas\naaccr-xml-10.1-sas.jar",
  sourcefile="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\your-output-naaccr-xml-data-file.xml.gz",
  naaccrversion="230", 
  recordtype="I",
  dataset=fromxml2,
/* items="patientIdNumber,primarySite", */
  dictfile="R:\EPI\CANCER\Call_For_Data\2023-Nov-Call-for-Data\SEER\seer-transmission-nov2023-dictionary.csv",
  cleanuptempfiles="yes",
  groupeditems="no"
);

*title1 "CONTENTS OF STANDARD AND NON-STANDARD DATA ITEMS IN 'sourcefile' above";
*proc contents data=fromxml2 position;
*run;

*** b. Compare the original XML file as read by SAS with the one that was written. ***;
*proc sort data=fromxml;
*   by addrAtDxState patientIdNumber sequenceNumberCentral;
*run;
*proc sort data=fromxml2;
*   by addrAtDxState patientIdNumber sequenceNumberCentral;
*run;
*title1 "RESULTS FROM COMPARISON OF ORIGINAL XML FILE and XML FILE THAT WAS WRITTEN BY SAS";
*proc compare base=fromxml compare=fromxml2 maxprint=(20,10000) LISTBASEOBS LISTCOMPOBS;
*  id addrAtDxState patientIdNumber sequenceNumberCentral;
*run;