/************************************************************************************************************
This SAS program is designed to assist the NAACCR community in creating local SEER*Stat databases at 
the census tract 2010 level of geography. This program contains 4 sections:

   A - the first reads the population file you will download from https://seer.cancer.gov/censustract-pops/
       and writes population records that match your incidence file.

   B - the second reads either NAACCR, NPCR or SEER Nov22 XML cancer incidence submission files. 
       This section harnesses SAS code and macros written by Fabian Depry (IMS), adds SAS labels, and removes fields from the SAS
       datasets that are 100% missing (blank). This program works with naaccr-xml-utility-9.0+
       If SEER registries want to include IHS linkage results for use in race categorization, they will need to insert the linkage results
       into their SEER submission file first, then run this program.

   C - the third reads the time-dependent census tract attributes file you will download from 
       https://seer.cancer.gov/seerstat/variables/countyattribs/census-tract-attribs.html
       and writes attribute records that match your incidence and population files. This section uses your incidence and population
       files to find all combinations of state-county-tract-year (including invalid and missing combinations) 
       and builds a census tract attributes file to match your data. 

   D - the fourth section writes the cancer incidence .CSV files for use in SEER*Prep.

Section B of this program harnesses SAS code for reading NAACCR XML files. For more information, see: 
https://education.naaccr.org/products/sassafras-reading-and-writing-naaccr-v21-xml-using-sas
https://www.naaccr.org/analysis-and-data-improvement-tools/#TRANSLATION

You will need to make changes to each row of the SAS code marked at the end with /* NEED TO CHANGE THIS */

************************************************************************************************************/


*** SECTION A - READ AND WRITE POPULATION FILE ***;


*** STEP 1. Specify the folder location that SAS will use to write HTML output files. ***;
/* Send HTML results and graphics to the project folder (URL=none is attached to the graphics path). */
/* The second ODS statement sets NEWFILE=PROC for the HTML destination.                              */
ods html
   path ="R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\Idaho_test"               /* NEED TO CHANGE THIS */
   gpath="R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\Idaho_test" (URL=none);   /* NEED TO CHANGE THIS */
ods html newfile=proc;


*** STEP 2. Read the population file for the entire U.S. - the one you downloaded from the NCI website. ***;
*** SPECIFY THE PATH AND FILENAMES FOR THE INPUT POPULATION DATA (pops_in) ***;
*** THE OUTPUT DATASET (pops_SP) MUST HAVE .TXD EXTENSION FOR SEER*PREP ***;
*** MODIFY THE 2 LINES BELOW ***;
/* pops_in : tract-level population text file that is read below (input).            */
/* pops_SP : state-level population file that is written later in this section.      */
filename pops_in 'R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\Populations\us.2006_2020.tract.level.pops.txt';  /* NEED TO CHANGE THIS */
filename pops_SP 'R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\Populations\id.2006_2020.tract.pops.txd';        /* NEED TO CHANGE THIS */

/* Parse each fixed-width record (30 bytes, padded) of the population file into WORK.pops. */
data pops;
   infile pops_in lrecl=30 pad missover;
   input @1  Year     4.
         @5  State_A  $Char2.
         @7  State_F  2.
         @9  County   3.
         @12 Tract    $Char6.
         @18 Race     1.
         @19 Sex      1.
         @20 AgeGrp   2.
         @22 Pop      9.
         ;
run;


*** MODIFY THE WHERE STATEMENT TO SELECT YOUR STATE, COUNTIES, OR YEARS OF DIAGNOSIS TO MATCH YOUR INCIDENCE DATA. ***;
/* Subset the population records to the selected state(s) and order them by year, state FIPS and county. */
proc sort data=pops (where=(state_A in ('ID')))                                                                                     /* NEED TO CHANGE THIS */
          out=statepops;
   by year state_F county;
run;

*** STEP 3 - WRITE POPULATION DATA FOR SEER*PREP ***;
/* One-way frequency tables of state, county and year for the subset population records. */
title1 'FREQUENCIES OF SELECT VARIABLES FROM POPULATION FILE';
proc freq data=statepops;
   tables State_A;
   tables County;
   tables year;
run;
   
/* Write the subset population records to the pops_SP file as fixed-width 30-byte records. */
/* Numeric codes are zero-filled (Zw. formats) except Year and sex.                        */
data _null_;
   set statepops;
   file pops_SP lrecl=30 pad;
   put @1  Year     4.
       @5  State_A  $2.
       @7  State_F  z2.
       @9  County   z3.
       @12 tract    $6.
       @18 race     z1.
       @19 sex      1.
       @20 agegrp   z2.
       @22 Pop      z9.
       ;
run;



*** SECTION B - READ INCIDENCE DATA ***;


*** STEP 1. Use the NAACCR XML SAS tools written by Fabian Depry to read an XML file into SAS.                             ***;
*** Specify the location of the read_naaccr_xml_macro.sas, the path for the SAS jar file, the source XML data file         ***; 
*** (this can read GZIP or unzipped XML files), and the CSV version of the user-defined data dictionary (if there is one). ***;

/************************************************************************************************************
    Original Comments from Fabian's "read" example:

    This program demonstrates how to include and call the "read_naaccr_xml_macro.sas" macro.
    This example assumes that the JAR file [Java ARchive] is in the same folder as this program.
    Make sure that the naaccrVersion and recordType are correct or some data items won't be correctly populated.

    Commented out in the code below, this example includes two items, meaning that only those two items will be included in the resulting
    data set. That parameter is optional and if not provided, the data set will contain all standard
    items plus any non-standard items provided via the extra dictionary. Be aware that creating a data set
    containing all items will be MUCH slower than creating one for just a few items, and so if you only need
    a handful of items to do your analysis, it is strongly recommended to provide those items (you can
    check the official NAACCR documentation (http://datadictionary.naaccr.org/default.aspx?Version=22)
    to find the NAACCR XML IDs to use in that list).
    The new parameter added in 8.6, cleanupcsv, can be set to "no" to aid in debugging. The default value is "yes." 
    This example references an extra user-defined dictionary that defines non-standard NAACCR data items. If
    your data file only contains standard data items, that dictionary is not needed. Otherwise the dictionary
    should have been provided by the organization that created the XML data file. Dictionaries are usually
    in XML format, but for technical reasons, the macro expects them in CSV format; the NAACCR XML Tool that
    is distributed with the macros has an option to load a dictionary and save it as CSV. The Word document here
    https://www.naaccr.org/analysis-and-data-improvement-tools/#TRANSLATION describes how to do this in detail.
    - dictfile is the path to an optional user-defined dictionary in CSV format (the NAACCR XML Tool that
        is distributed with the macros has an option to load an XML dictionary and save it as CSV).
        File*Pro can also generate those files. Use spaces to separate multiple paths if you need to
        provide more than one dictionary.
   *** IMPORTANT - The folder path(s) for the user-defined dictionaries cannot contain spaces.
 ************************************************************************************************************/

/* Load the read macro, then call it to read the XML incidence file into the data set named by dataset= (fromxml). */
/*   libpath       - path of the naaccr-xml SAS jar file                                                            */
/*   sourcefile    - the XML incidence submission file to read                                                      */
/*   naaccrversion - NAACCR version of the file ("220" here); recordtype - record type ("I" here)                   */
/*   items         - optional list of XML IDs to keep (commented out, so all items are read)                        */
/*   dictfile      - CSV version of the user-defined dictionary                                                     */
/*   cleanupcsv    - "yes" is the default; "no" can be used to aid debugging (see the notes above)                  */
%include "R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V22\XML_resources\naaccr-xml-utility-9.0\sas\read_naaccr_xml_macro.sas";          /* NEED TO CHANGE THIS */
%readNaaccrXml(
  libpath="R:\EPI\CANCER\NAACCR\DataStandardsAndDictionary\V22\XML_resources\naaccr-xml-utility-9.0\sas\naaccr-xml-9.0-sas.jar",            /* NEED TO CHANGE THIS */
  sourcefile="R:\EPI\CANCER\Call_For_Data\2022-Nov-Call-for-Data\NAACCR\Submission\ID9521V22.xml",                                          /* NEED TO CHANGE THIS */
  naaccrversion="220", 
  recordtype="I",
  dataset=fromxml,
/* items="patientIdNumber,primarySite", */
  dictfile="R:\EPI\CANCER\Call_For_Data\2022-Nov-Call-for-Data\EXT-02\ID.ext02.20221020103929.naaccr-call-for-data-dec2022-dictionary.csv", /* NEED TO CHANGE THIS */
  cleanupcsv="yes"
);

/* List the variables of the data set created from the XML file, including the listing by position. */
title1 'List of all NAACCR standard data items plus non-standard data items specified in the dictfile';
proc contents
   data=fromxml
   position;
run;


*** STEP 2. Create SAS variable labels.                                                                                                ***;
***  a. In the 'filename dicts...' below, put the path and name of the NAACCR 22 base dictionary CSV on the first line. You extracted  ***;
***     this in Step 4 (a) of the Word document instructions or it could have been provided by your registry software or obtained      ***;
***     from NAACCR.org.                                                                                                               ***;
***     The Word instructions are "Instructions-for-ReadWrite_NAACCR_22_XML_tidy.sas_20220926.docx."                                   ***;
***  b. In the 'filename dicts...' below, put the path and name of the user-defined dictionary CSV that you extracted in Step 4 (b)    ***;
***     of the Word document instructions on the second line or was provided by your software vendor or standard setter. If you do not ***;
***     have a user-defined dictionary, leave this blank.                                                                              ***;
***  c. In the 'filename labels...' below, put the path where you want to store a text file of SAS labels statements. This will be     ***;
***     used in Step 3 to label the SAS variables. Leave the filename as-is, "label-statements.txt".                                   ***;
/* dicts  : concatenation of the base dictionary CSV and the user-defined dictionary CSV. */
/* labels : text file that receives one generated LABEL statement per dictionary item.    */
filename dicts ("R:\EPI\CANCER\Call_For_Data\2022-Nov-Call-for-Data\EXT-02\ID.ext02.20221020103929.base-naaccr-dictionary.csv"      /* NEED TO CHANGE THIS */
                "R:\EPI\CANCER\Call_For_Data\2022-Nov-Call-for-Data\EXT-02\ID.ext02.20221020103929.naaccr-call-for-data-dec2022-dictionary.csv");                /* NEED TO CHANGE THIS */
filename labels "R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\Idaho_test\label-statements.txt";                 /* NEED TO CHANGE THIS */

/* Read the dictionary CSV rows and turn each one into a statement of the form            */
/*    label <xmlId> = '<number>_<name> - <xmlId> - <parent element>';                     */
/* The statements are kept in WORK.dict2labels and also written to the LABELS file, which */
/* is later pulled into a DATA step with %include.                                        */
data dict2labels;
   infile dicts delimiter = ',' MISSOVER DSD lrecl=32767 firstobs=2;
   /* The FORMAT statement is the first reference to each variable, so it also fixes the  */
   /* variable type and (for character variables) the length.                             */
   /* labelstatement is 300 bytes so the longest possible statement is not truncated,     */
   /* which would leave an unterminated quoted string in the generated code.              */
   format NAACCR_XML_ID      $32. 
          NAACCR_Number      best12. 
          Name               $100. 
          Start_Column       best12.
          Length             best12. 
          Record_Types       $20. 
          Parent_XML_Element $20.
          Data_Type          $20. 
          labelstatement     $300.;
   /* Start_Column is read as numeric, consistent with its BEST12. format above. */
   input
      NAACCR_XML_ID      $
      NAACCR_Number
      Name               $
      Start_Column
      Length
      Record_Types       $
      Parent_XML_Element $
      Data_Type          $;

   /* Remove any header row that is still present after FIRSTOBS=2, for example the */
   /* header of the second dictionary file.                                         */
   if NAACCR_XML_ID='NAACCR XML ID' then delete;

   /* The label text is enclosed in single quotes, so an apostrophe inside the item name */
   /* is doubled to keep the generated statement valid. The item number is converted     */
   /* explicitly with BEST12., the same format the implicit conversion would use.        */
   labelstatement = "label "||strip(NAACCR_XML_ID)||" = '"||strip(put(NAACCR_Number,best12.))||"_"||strip(tranwrd(Name,"'","''"))||" - "||strip(NAACCR_XML_ID)||" - "||strip(Parent_XML_Element)||"';";

   file labels;
   put labelstatement;
run;
/* Clear the title, then list every generated LABEL statement together with the record count. */
title1;
proc print data=dict2labels n noobs;
   var labelstatement;
run;


*** STEP 3. This is the TIDY part of the code that labels the SAS variables and drops variables that are 100% missing. If you looked         ***;
*** at the proc contents results from Step 1, you may have noticed that every single variable in the NAACCR V22 data dictionary is included, ***;
*** even though some of the variables were not included in the data file.                                                                    ***; 

*** The SAS Log will show notes for the variables not in the data file, but that you tried to label, indicating that the variable is         ***;
*** uninitialized, for example "NOTE: Variable textStaging is uninitialized." Do not worry about these notes. :)                             ***;

/* Keep the cases of the selected state(s) and attach the LABEL statements generated in Step 2. */
data xml_labeled;
   /* LIMIT TO STATE/CATCHMENT AREA CASES */
   set fromxml (where=(addrAtDxState in ('ID')));                                                                                   /* NEED TO CHANGE THIS */
   %include labels;
run;

* Get list of variables ;
/* Transpose zero observations of xml_labeled, producing WORK.names with one row per variable (_NAME_). */
proc transpose
   data=xml_labeled (obs=0)
   out=names;
   var _all_;
run;

* Generate code to count non-missing values ;
/* Write a PROC SQL step to the temporary file CODE, then run it with %include.              */
/* The generated query creates WORK._counts with one column per variable of xml_labeled,    */
/* each holding sum(not missing(variable)), i.e. the number of non-missing values.           */
filename code temp;
data _null_;
   file code ;
   set names end=eof;
   /* First name: write the query header and stay on the line. Later names: write a comma first. */
   if _n_=1 then put
 'proc sql noprint;'
/'create table _counts as select' 
/' ' @
  ; else put ',' @ ;
  /* One select item per variable, using the variable name as the column alias. */
  put 'sum(not missing(' _name_ ')) as ' _name_ ;
  /* After the last name: close the query and the procedure. */
  if eof then put
 'from xml_labeled'
/';'
/'quit;'
  ;
run;

* Run generated code ;
%include code / source2 ;

* Generate DROP statement ;
/* Write a DROP statement to a new temporary file CODE, naming every column of _counts whose */
/* value is 0, i.e. every variable that has no non-missing value in xml_labeled.             */
/* NOTE(review): if no count equals 0, the generated statement is "drop ;" with an empty     */
/* list - confirm that this case cannot occur or is acceptable.                              */
filename code temp;
data _null_;
  set _counts ;
  /* c spans all numeric variables of _counts that exist at this point */
  array c _numeric_;
  file code lrecl=80 ;
  length _name_ $32 ;  
  put 'drop ' @;
  /* _n_ is reused as the loop index over the array elements */
  do _n_=1 to dim(c);
   if c(_n_)=0 then do ;
     _name_ = vname(c(_n_));
     put _name_ @ ;
  end;
  end;
  put ';' ;
run;

* Make version of data without empty variables ;
/* Apply the generated DROP statement to obtain a copy of xml_labeled without the empty variables, */
/* then list the remaining variables.                                                              */
data xml_labeled_tidy;
   set xml_labeled;
   %include code / source2;
run;

proc contents
   data=xml_labeled_tidy
   position;
run;



*** SECTION C - READ ATTRIBUTES DATA. LATER, WILL TAKE ONE FIELD FROM ATTRIBUTES DATA AND INSERT IT INTO INCIDENCE DATA ***;


*** STEP 1. Read in U.S. census tract attributes file. Note that this was updated August 5, 2022. ***;

/* Fileref of the tract-level attributes text file that is read below. */
filename attr "R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\TractAttributes\tract.level.ses.2008_17.txt";       /* NEED TO CHANGE THIS */

/* Read each attributes record twice over: once as the complete 246-byte text (buffer), which */
/* is written back out unchanged later, and once as the individual key and zone fields.       */
/* Records for the first file year (2008) are copied to 2006 and 2007, and records for the    */
/* last file year (2017) are copied to 2018-2020, so every year used in this project has a    */
/* record.                                                                                    */
data attr;
  /* TRUNCOVER: a record shorter than 246 bytes is read as far as it goes instead of making */
  /* INPUT continue on the next line, which would corrupt this record and skip the next.    */
  infile attr lrecl=246 truncover;
  input @01 buffer       $Char246.
        @01 State               2.
        @03 County              3.
        @06 Tract_2010          6.
        @12 year_dx             4.
        @147 cancer_reporting_zone $Char10. 
        @157 tract_zone_certainty $Char1.
        ;
   output;

   *** ADD ATTRIBUTE VALUES FOR MAXIMUM YEARS COVERED BY INCIDENCE FILE FOR THIS PROJECT ***;
   /* Only year_dx changes in the copies; buffer still carries the original year text. */
   if year_dx = 2008 then do; 
      year_dx = 2006; output; 
      year_dx = 2007; output; 
   end;
   if year_dx = 2017 then do; 
      year_dx = 2018; output; 
      year_dx = 2019; output; 
      year_dx = 2020; output; 
   end;
run;
/* Order the attributes by the merge key, then tabulate state and year as a check. */
proc sort data=attr;
   by State County Tract_2010 year_dx;
run;

title1 'Frequencies from Attributes Data';
proc freq data=attr;
   tables State;
   tables year_dx;
run;


*** STEP 2. READ IN STATE-COUNTY-TRACT-YEAR COMBINATIONS FROM YOUR INCIDENCE FILE ***;
/* Build the numeric state-county-tract-year keys found in the incidence data.               */
/* Cases are restricted to diagnosis years 2006-2020; registries may change this to match    */
/* the years of incidence that are geocoded to 2010 census geography.                        */
data incidence;
   set fromxml (where=(substr(dateOfDiagnosis,1,4) >= '2006' and substr(dateOfDiagnosis,1,4) <= '2020'));                           /* MAY NEED TO CHANGE THIS */
   keep State County Tract_2010 year_dx;

   /* character items become numeric by adding 0 */
   State      = put(STFIPS(addrAtDxState),z2.) + 0;
   County     = countyAtDxAnalysis + 0;
   Tract_2010 = censusTract2010 + 0;
   year_dx    = substr(dateOfDiagnosis,1,4) + 0;
run;

title1 'State, County, Year, and Tract 2010 Frequencies from Incidence File';
proc freq data=incidence;
   tables State County year_dx Tract_2010;
   format Tract_2010 z6.;
run;


*** STEP 3. READ IN STATE-COUNTY-TRACT-YEAR COMBINATIONS FROM YOUR POPULATION FILE ***;
/* Re-read the state population file written in Section A, keeping only the numeric */
/* year, state, county and tract keys. This replaces the earlier WORK.pops.         */
data pops;
   infile pops_SP lrecl=30 pad missover;
   input @1  year_dx    4.
         @7  State      2.
         @9  County     3.
         @12 Tract_2010 6.
         ;
run;

title1 'State, County, Year, and Tract 2010 Frequencies from Population File';
proc freq data=pops;
   tables State County year_dx Tract_2010;
   format Tract_2010 z6.;
run;


*** STEP 4. FIND ALL COMBINATIONS OF STATE-COUNTY-TRACT_2010-YEAR_DX FROM INCIDENCE AND POPULATION FILES ***;
/* Stack the key combinations from the incidence and population files and replace missing */
/* key values with fixed codes so that they sort last.                                    */
data allcombos;
   set incidence pops;

   *** INPUT VALUE FOR YOUR STATE FIPS CODE HERE (16 = Idaho) ***;
   if State  = . then State  = 16;                                                                                                  /* NEED TO CHANGE THIS */

   if County     = . then County     = 999;    /* need missing to be last in sort order */
   if Tract_2010 = . then Tract_2010 = 999999; /* need missing to be last in sort order */
run;
/* Reduce to one record per state-county-tract-year combination and tabulate the result. */
proc sort data=allcombos nodupkey;
   by State County Tract_2010 year_dx;
run;

title1 'State, County, Year, and Tract 2010 Frequencies from Combination of Incidence and Population Files';
proc freq data=allcombos;
   tables State County year_dx Tract_2010;
   format Tract_2010 z6.;
run;


*** STEP 5. MATCH MERGE COMBINATIONS FROM INCIDENCE AND POPULATION FILES WITH ATTRIBUTES AND WRITE ATTRIBUTES .TXD ***;
*** THIS STEP ALSO CREATES A TEXT FILE WITH THE FORMATS FOR SEER*Prep STATE-COUNTY-TRACT RECODE ***;
*** YOU WILL PASTE THE CONTENTS OF THIS FILE INTO EDIT FORMAT OF SEER*PREP FOR THE STATE-COUNTY-TRACT RECODE VARIABLE ***;  
/* Attach the attributes to every key combination; combinations without a matching   */
/* attributes record are kept, attributes records without a combination are not.     */
data allcombos_attr;
   merge allcombos (in=in_combos)
         attr;
   by State County Tract_2010 Year_dx;
   if in_combos;
run;

proc sort data=allcombos_attr;
   by State county Tract_2010 Year_dx;
run;

title1 'State, County, Year, and Tract 2010 Frequencies Selected from Attributes File';
proc freq data=allcombos_attr;
   tables State county Tract_2010 Year_dx;
   format Tract_2010 z6.;
run;


*** Specify path and file name of attribute file you will use in SEER*Prep ***;
filename outattr "R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\TractAttributes\Idaho.tract.level.ses.2006_2020.txd";  /* NEED TO CHANGE THIS */


*** Specify path for the text file for the formats for state-county-tract recode ***; 
filename outsct 'R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\TractAttributes\state.county.tract.recode.txt';   /* NEED TO CHANGE THIS */

/* Write two files from allcombos_attr:                                                      */
/*   outsct  - one line per state-county-tract, "<sct_rec>=<state><county><tract>", where    */
/*             sct_rec is a running number that increases by 1 for each new tract            */
/*   outattr - one 246-byte record per state-county-tract-year                               */
data _null_;
   set allcombos_attr;
   /* sct_rec starts at 0 and keeps its value across observations */
   retain sct_rec (0);
   by State county Tract_2010;
   if first.Tract_2010=1 then do; 
      sct_rec = sct_rec+ 1;
      file outsct;
      put @ 1 sct_rec      z5.
          @ 6 "="
          @ 7 state        z2.
          @ 9 county       z3.
          @12 tract_2010   z6.;
   end;

   file outattr lrecl=246 pad;
*  if County ^= 999 and Tract_2010 ^= 999999 then do;
   /* Matched attributes record (buffer not blank in column 1) with a non-missing tract code: */
   /* write the original record text, then overwrite the year (columns 12-15) and put the     */
   /* recode number in columns 16-20.                                                         */
   if Tract_2010 ^= 999999 and substr(buffer,1,1) ^= ' ' then do;
       put  @01 buffer $char246.
            @12 year_dx 4. 
            @16 sct_rec z5. ;
   end;
   /* Otherwise build the record from the key fields and the two zone fields only. */
   else put @01 State        z2.
            @03 County       z3.
            @06 tract_2010   z6.
            @12 year_dx      4.
            @16 sct_rec      z5.
            @147 cancer_reporting_zone $Char10. 
            @157 tract_zone_certainty  $Char1.
;
run;



*** SECTION D - WRITE CENSUS TRACT-LEVEL INCIDENCE DATASET FOR SEER*PREP ***;


*** STEP 1. Match merge tract_zone_certainty (0,1) into Incidence File so it can be used in Merged variables in SEER*Stat. ***;
*** Assumes countyAtDxAnalysis is mapped to countyAtDxGeocode2010 for 2006-2020 in Nov 2022 data submission. ***;
/* Sort the tidy incidence data by state, county and tract.                                   */
/* NOTE(review): no OUT= is given, so the WHERE statement appears to subset                   */
/* WORK.xml_labeled_tidy itself to the selected years - confirm this is intended.             */
proc sort data=xml_labeled_tidy;

   *** Restrict cases to 2006-2020. This could be modified by registries to match which years of incidence are geocoded to 2010 census geography ***;
   where substr(dateOfDiagnosis,1,4) >= '2006' and substr(dateOfDiagnosis,1,4) <= '2020';                                           /* MAY NEED TO CHANGE THIS */

   by addrAtDxState countyAtDxAnalysis censusTract2010;
run;
/* Convert the numeric state and county keys of the attributes data to the character form */
/* used in the incidence data (state abbreviation, 3-digit county).                       */
data attr2;
   set attr;
   addrAtDxState      = input(FIPSTATE(put(State,z2.)),$Char2.);
   countyAtDxAnalysis = input(put(County,z3.),$Char3.);
*   censusTract2010    = input(put(tract_2010,z6.),$Char6.);
   keep addrAtDxState countyAtDxAnalysis /* censusTract2010 cancer_reporting_zone */ tract_zone_certainty;
run;  
/* NOTE(review): the tract key is commented out here and in the merge below, so NODUPKEY  */
/* keeps a single tract_zone_certainty value per state-county and the merge attaches that */
/* one value to every case in the county - confirm whether a tract-level match was        */
/* intended.                                                                              */
proc sort data=attr2 nodupkey;
   by addrAtDxState countyAtDxAnalysis /* censusTract2010 */;
run;
*proc freq data=attr2;
*   tables addrAtDxState countyAtDxAnalysis censusTract2010;
*run;

/* Keep every incidence record; attributes rows without a matching case are not output. */
data incidence_plus;
   merge xml_labeled_tidy (in=dats) attr2;
   by addrAtDxState countyAtDxAnalysis /* censusTract2010 */;
   if dats=1 then output;
run;


*** STEP 2. You can now work with the SAS dataset "incidence_plus." You may wish to save it as a permanent SAS dataset (in a folder)   ***;
*** or modify the code below to review the frequencies of variables. You can modify data items, like remove the day components of dates. ***;
*** You can do statistical analysis, etc. OR, write CSV file for SEER*Prep.                                                              ***;
*** The following items were from the Nov22 SEER data submission. If you are using your NPCR data submission file for the base of your   ***;
*** census tract SEER*Stat db, change the data items to match the NPCR submission dictionary. ***;
/* One-way frequencies of selected non-standard items; change the item list to match your submission dictionary. */
title1 'FREQUENCIES FOR SELECT NON-STANDARD DATA ITEMS';
proc freq data=incidence_plus;
   tables acsPctPovAllRaces;                                                                                                        /* MAY NEED TO CHANGE THIS */
   tables cdcSVI2018;                                                                                                               /* MAY NEED TO CHANGE THIS */
   tables microMatchStatus;                                                                                                         /* MAY NEED TO CHANGE THIS */
   tables tractEstCongressDist;                                                                                                     /* MAY NEED TO CHANGE THIS */
   tables cancerReportingZoneTractCert;                                                                                             /* MAY NEED TO CHANGE THIS */
run;

/* Tabulate the geography of any case with a blank cancerReportingZoneTractCert; none are expected. */
title1 'Error Check - should produce no output';
proc freq data=incidence_plus (where=(cancerReportingZoneTractCert = ''));
   tables addrAtDxState countyAtDxAnalysis censusTract2010;
run;


*** STEP 3 - ADD RECODES FOR SEER*PREP ***;  
/* Order the cases by patient and record number, the BY order used by the recode step. */
proc sort
   data=incidence_plus;
   by patientIdNumber recordNumberRecode;
run;

/* Add the recode variables used in SEER*Prep to every case:                                  */
/*   stateCountyRecode, censusTractRecode, censusTractCert, an adjusted behaviorCodeIcdO3,    */
/*   a recoded race1 (original kept in race1Original), publicReleaseFlag and mol_subtype.     */
data case_recodes;
   set incidence_plus;
   by patientIdNumber recordNumberRecode;
   length stateCountyRecode $ 5;

   /* County and tract values that sort below the first valid code are replaced with all 9s. */
   if countyAtDxAnalysis < '001' then countyAtDxAnalysis = '999';
   if censusTract2010 < '000001' then censusTract2010 = '999999';

   /* 2-digit state FIPS code followed by the county code */
   stateCountyRecode = put(STFIPS(addrAtDxState),z2.) || countyAtDxAnalysis;
   censusTractRecode = censusTract2010;
   censusTractCert   = censusTrCertainty2010;

   *** Set behavior code to 3 (malignant) for in situ urinary bladder cases ***;
   if behaviorCodeIcdO3 = '2' and primarySite >= 'C670' and primarySite <= 'C679' and not( 
   histologicTypeIcdO3 in ('9050','9051','9052','9053','9054','9055','9140') or (histologicTypeIcdO3 >= '9590' and histologicTypeIcdO3 <= '9992')) then behaviorCodeIcdO3 = '3';

   *** Race1 is overwritten using logic for 4 broad categories plus unknown. This logic is the same as NAACCR and USCS for producing rates by race. ***;
   *** The original race1 values with fine categories are preserved in a new variable, race1Original ***; 
   race1Original = race1;

   *** define seer race recode (w, b, ai/an, api) ***;
       seerracerec = race1;

      *** if white, check race2 ***;
      if seerracerec = '01' and race2 in ('02','03','04','05','06','07','08','09','10','11','12','13','14','15','16','17','20','21','22','25','26','27','28',
                     '30','31','32','96','97') then seerracerec = race2;       

      *** if white, check ihslink. Apply this logic to all states except Alaska. ***;
      if seerracerec in ('01','98','99') and ihsLink = '1' and addrAtDxState ne 'AK'  then seerracerec = '03';

   *** OVERWRITE RACE1 WITH seerracerec ***;
       race1 = seerracerec;

   *** Define publicReleaseFlag for SEER registries ***;   
   /* Flag is 1 for diagnosis years 2000-2020 with sex coded 1 or 2, otherwise 0. */
   yeardx = substr(dateOfDiagnosis,1,4);
   if '2000' <= yeardx <= '2020' and sex in ('1','2') then publicReleaseFlag = 1;                                                   /* MAY NEED TO CHANGE THIS */
   else publicReleaseFlag = 0;

/*************** CATEGORIZE BREAST CANCER MOLECULAR SUBTYPES (LOGIC WORKS FOR 2011+ CASES) ******************
   C500-C506, C508-C509 :   8000-8700, 8982-8983
   C501-C506, C508-C509 :   8720-8790                                                                      */

   /* mol_subtype values assigned below: 1 = HR+/HER2-, 2 = triple negative, 3 = HR+/HER2+, */
   /* 4 = HR-/HER2+, 5 = unknown or borderline, 9 = not applicable.                          */
   if substr(dateOfDiagnosis,1,4) >= '2011' and behaviorCodeIcdO3 in ('2','3')
   and ((((primarySite >= 'C500' and primarySite <= 'C506') or (primarySite >= 'C508' and primarySite <= 'C509'))
   and ((histologicTypeIcdO3 >= '8000' and histologicTypeIcdO3 <= '8700') or (histologicTypeIcdO3 >= '8982' and histologicTypeIcdO3 <= '8983')))
   or  (((primarySite >= 'C501' and primarySite <= 'C506') or (primarySite >= 'C508' and primarySite <= 'C509'))
   and (histologicTypeIcdO3 >= '8720' and histologicTypeIcdO3 <= '8790'))) then do;
  
      *** 2011-2017 CASES USE csSiteSpecificFactor15 FOR HER2 ***;
      *In data submissions prior to November 2020, borderline ER/PR was classified with positive. Those '30' values are commented out below ***;
      if  substr(dateOfDiagnosis,1,4) >= '2011' and substr(dateOfDiagnosis,1,4) <= '2017' then do; 
         if (csSiteSpecificFactor1=10 /* or csSiteSpecificFactor1=30 */ or csSiteSpecificFactor2=10 /* or csSiteSpecificFactor2=30 */) AND csSiteSpecificFactor15=20 then mol_subtype = 1 ;      *HR+/HER2-;
         else if csSiteSpecificFactor1=20 AND csSiteSpecificFactor2=20 AND csSiteSpecificFactor15=20 then mol_subtype = 2 ;                                *Triple negative;
         else if (csSiteSpecificFactor1=10 /* or csSiteSpecificFactor1=30 */ or csSiteSpecificFactor2=10 /* or csSiteSpecificFactor2=30 */) AND csSiteSpecificFactor15=10 then mol_subtype = 3 ; *HR+/HER2+;
         else if (csSiteSpecificFactor1=20 and csSiteSpecificFactor2=20) AND csSiteSpecificFactor15=10  then mol_subtype= 4 ;                              *HR-/HER2+;
         else if mol_subtype=. then mol_subtype= 5;  
      end;

      *** 2018+ CASES USE SSDIs ***;
      if  substr(dateOfDiagnosis,1,4) >= '2018' then do; 
         if her2OverallSummary='1' then do;
            if estrogenReceptorSummary='1' or progesteroneRecepSummary='1' then mol_subtype = 3 ;       *HR+/HER2+;
            else if estrogenReceptorSummary='0' and progesteroneRecepSummary='0' then mol_subtype= 4 ;  *HR-/HER2+;
            else if mol_subtype=. then mol_subtype= 5;                                                  * UNKNOWN OR BORDERLINE; 
         end;
         else if her2OverallSummary='0' then do;
            if estrogenReceptorSummary='1' or progesteroneRecepSummary='1' then mol_subtype = 1 ;       *HR+/HER2-;
            else if estrogenReceptorSummary='0' and progesteroneRecepSummary='0' then mol_subtype = 2 ; *Triple negative;
            else if mol_subtype=. then mol_subtype= 5;                                                  * UNKNOWN OR BORDERLINE; 
         end;
         else if her2OverallSummary not in ('0','1') then mol_subtype= 5;                               * UNKNOWN; 
      end;
   end;
   else mol_subtype = 9;                                                                                * NOT APPLICABLE;

/*
2018+: HER2 Overall Summary site-specific data item
2011-2017: Collaborative Stage (CS) site-specific factor 15 ("HER2: Summary Result of Testing")
2010: Summary of several CS site-specific factors (described below)
For analysis over time, the following categories are available:

Positive
Negative
Borderline/Unknown*
Not 2010+ Breast
*Due to coding changes in 2018, borderline and unknown are grouped together. Prior to 2018, borderline cases can be isolated using the underlying site-specific factors.

If "Derived HER2 Recode (2010+)" is positive:
If "ER Status Recode Breast Cancer (1990+)" or "PR Status Recode Breast Cancer (1990+)" is positive*, value is Her2+/HR+
If "ER Status Recode Breast Cancer (1990+)" and "PR Status Recode Breast Cancer (1990+)" are both negative, value is Her2+/HR-
Otherwise, value is unknown or borderline*
If "Derived HER2 Recode (2010+)" is negative:
If "ER Status Recode Breast Cancer (1990+)" or "PR Status Recode Breast Cancer (1990+)" is positive*, value is Her2-/HR+
If "ER Status Recode Breast Cancer (1990+)" and "PR Status Recode Breast Cancer (1990+)" are both negative, value is triple negative
Otherwise, value is unknown or borderline*
All other values are unknown

estrogenReceptorSummary
PARENT XML ELEMENT:     Tumor
Description
ER (Estrogen Receptor) Summary is a summary of results of the estrogen receptor (ER) assay.
Rationale
This data item is required for prognostic stage grouping in AJCC 8th edition, Chapter 48, Breast. It was previously collected as Breast CS SSF # 1.
Codes
0     ER negative (0.0% or less than 1%)
1     ER positive
7     Test ordered, results not in chart
9     Not documented in medical record
Cannot be determined (indeterminate)
ER (Estrogen Receptor) Summary status not assessed or unknown if assessed

progesteroneRecepSummary
PARENT XML ELEMENT:     Tumor
Description
PR (Progesterone Receptor) Summary is a summary of results from the progesterone receptor (PR) assay.
Rationale
This data item is required for prognostic stage grouping in AJCC 8th edition, Chapter 48, Breast. It was previously collected as Breast CS SSF # 2.
Codes
0     PR negative (0.0% or less than 1%)
1     PR positive
7     Test ordered, results not in chart
9     Not documented in medical record
Cannot be determined (indeterminate)
PR (Progesterone Receptor) Summary status not assessed or unknown if assessed

her2OverallSummary
0     HER2 negative; equivocal
1     HER2 positive
7     Test ordered, results not in chart
8     Not applicable: Information not collected for this case
(If this item is required by your standard setter, use of code 8 will result in an edit error.)
9     Not documented in medical record
Cannot be determined (indeterminate)
HER2 Overall Summary status not assessed or unknown if assessed
*/

   *** Other restrictions for each registry to consider:
       -Limiting cases to high quality geocodes
       -Delete out of state cases, if any - none should be included in data submittion files
       -Other? ***;
run;
/* Check the recodes: diagnosis year, the race recode against its source items, and subtype by year. */
title1 'Frequency Distribution of Diagnosis Year, recoded race1, Breast Cancer Molecular Subtype';
proc freq data=case_recodes;
   tables yeardx;
   tables race1 * seerracerec * race1Original * race2 * ihsLink
          / list missing;
   tables yeardx * mol_subtype
          / nocol norow nopercent missing;
run;


*** STEP 4. SPLIT AND EXPORT CSV FILES TO BE READ INTO SEER*PREP ***;
/* Create the macro variables that drive the CSV split:                                      */
/*   cnt       - number of records in case_recodes                                           */
/*   numsplits - number of output files, based on a target of 300,000 records per file       */
/*   reg       - state abbreviation in single quotes;  regnoq - the same without quotes      */
/*   strglen   - character length specification ($n.) allowing 5 bytes per output file name  */
/* The step runs once per record, so reg and regnoq hold the value from the last record.     */
data _null_;
   set case_recodes nobs=nobs;

   /* derived values */
   splitsize = 300000;
   n_splits  = ceil(nobs / splitsize);
   regnoq    = strip(addrAtDxState);
   regtxt    = "'"||strip(addrAtDxState)||"'";
   strtxt    = '$'||strip(n_splits*5)||'.';

   /* publish them as macro variables */
   call symput("cnt",       (left(put(nobs,best.))));
   call symput("numsplits", (left(put(n_splits,best.))));
   call symput("reg",       (left(put(regtxt,$4.))));
   call symput("regnoq",    (left(put(regnoq,$2.))));
   call symput("strglen",   (left(put(strtxt,$12.))));
run;

/* Build the blank-separated list of output data set names (<state>1 <state>2 ...) and store */
/* it in the macro variable dsnames for use on a later DATA statement.                       */
data fnamestring;
   length fnamestring &strglen.; 
   fnamestring = &reg.||'1';
   do i = 2 to &numsplits.;
      /* i is numeric, so it is formatted with the numeric format 2. (a character format */
      /* such as $2. does not match the variable type)                                    */
      fnamestring = strip(fnamestring)||' '||&reg.||left(put(i,2.));
   end;
   call symput("dsnames", (left(put(fnamestring,&strglen.))));
run;

/* Assign every case to an output file number and attach two pieces of generated code:      */
/*   outtxtstring - an OUTPUT statement that routes the record to its per-file data set     */
/*   exptxtstring - a PROC EXPORT step that writes that data set to a CSV file              */
data case_recodes2;
   /* exptxtstring2 holds the output folder. It is 260 bytes wide so that a long folder     */
   /* path is not truncated, which would produce an invalid outfile= value.                 */
   format outtxtstring exptxtstring1 exptxtstring3 exptxtstring4 $100. exptxtstring2 $260. exptxtstring $400.;
   set case_recodes;
   by patientIdNumber recordNumberRecode;   
   retain filenum (1) ind (0);

   /* Start a new file once the current one holds more than cnt/numsplits records, but only */
   /* at the first record of a patient so that a patient is never split across files.       */
   if (&numsplits > 1 & _n_ > (ind + (&cnt/&numsplits)) & first.patientIdNumber) then do;
      ind = _n_;
      filenum = filenum + 1;
   end;  
   outtxtstring = 'if filenum = '||filenum||' then output '||left(addrAtDxState)||left(filenum)||';';
   exptxtstring1= 'proc export data=work.'||left(addrAtDxState)||left(filenum)||' outfile="';

   /* SPECIFY THE OUTPUT PATH FOR YOUR INCIDENCE CSV FILES IN exptxtstring2. END WITH \. EXAMPLE VALUE:
      R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2022\Webinar-20220425\Demo\ */
   exptxtstring2="R:\EPI\CANCER\NCI_SEER\SEERStat\CensusTract_Local_Databases_2023\Idaho_test\";                                    /* NEED TO CHANGE THIS */

   exptxtstring3=left(addrAtDxState)||left(filenum);
   exptxtstring4='.recoded.csv" dbms=csv replace; run;'; 
   exptxtstring = strip(exptxtstring1)||strip(exptxtstring2)||strip(exptxtstring3)||strip(exptxtstring4);
   drop exptxtstring1-exptxtstring4;
run; 
/* Show how many records go to each output file and the generated code strings. */
title1 'Number of output CSV files and records per file, outtxtstring exptxtstring';
proc freq data=case_recodes2;
   tables filenum;
   tables outtxtstring;
   tables exptxtstring;
run;


*** SPLIT INTO MULTIPLE SAS DATASETS ***;
/* One row per distinct OUTPUT statement. */
proc sort data=case_recodes2
          out=filetext
          nodupkey;
   by outtxtstring;
run;

/* Write the OUTPUT statements to a temporary file. */
filename toincl temp;
data _null_;
   set filetext;
   file toincl;
   put outtxtstring $;
run;

/* Create all per-file data sets in one pass; the included statements route each record, */
/* and the helper variables are left out of the results.                                 */
data &dsnames.;
   drop outtxtstring exptxtstring yeardx seerracerec filenum ind;
   set case_recodes2;
   %include toincl;
run;


*** EXPORT EACH OF THE SAS DATASETS TO CSV FOR USE IN SEERPREP ***;
/* One row per distinct PROC EXPORT step. */
proc sort data=case_recodes2 out=exporttext nodupkey;
   by exptxtstring;
run;

/* Write the PROC EXPORT steps to a temporary file. */
filename toexp temp;
data _null_;
   set exporttext;
   file toexp;
   put exptxtstring $;
run;

/* Run the generated steps. Each line is a complete "proc export ...; run;" step, so the */
/* file is included in open code rather than inside a DATA step.                          */
%include toexp;