s_1603 #1242

Open · wants to merge 3 commits into master

2 changes: 1 addition & 1 deletion scripts/us_census/acs5yr/subject_tables/S1603/README.md
@@ -2,7 +2,7 @@

This import gives the count of people based on the language spoken at home. It breaks this down further by age, nativity and citizenship status, poverty status, and educational attainment.

Years: 2010-2019
Years: 2010-2023
Geo : Country, State, County and Place

Important Notes :
(file name not shown)
@@ -301,4 +301,5 @@
"Percent speak only English at home MOE",
"PERCENT IMPUTED"
]
}
}

75 changes: 73 additions & 2 deletions scripts/us_census/acs5yr/subject_tables/S1603/process.py
@@ -3,6 +3,8 @@
import sys
import pandas as pd
import numpy as np
from absl import app, flags
import json

# pylint: disable=missing-module-docstring
# pylint: disable=missing-class-docstring
@@ -11,12 +13,37 @@
#pylint: disable=wrong-import-position
#pylint: disable=import-error
_CODEDIR = os.path.dirname(os.path.realpath(__file__))
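# The shared subject-table helpers imported below (data_loader, resolve_geo_id,
# generate_col_map) live in the sibling 'common' directory, so add it to sys.path.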
sys.path.insert(1, os.path.join(_CODEDIR, '..', 'common'))
sys.path.insert(1, os.path.join(_CODEDIR, '../', 'common'))

from data_loader import SubjectTableDataLoaderBase # commit hash - aee443ee
from resolve_geo_id import convert_to_place_dcid
from generate_col_map import process_zip_file
#pylint: enable=wrong-import-position
#pylint: enable=import-error
FLAGS = flags.FLAGS
flags.DEFINE_string(
    'option', 'all',
    'Specify how to run the process: colmap -- generates the column map, '
    'process -- runs processing, all -- runs colmap first and then processing')
flags.DEFINE_string(
    'table_prefix', None,
    '[for processing] Subject Table ID used as a prefix for output files, eg: S2702')
flags.DEFINE_string('spec_path', None, 'Path to the JSON spec [mandatory]')
flags.DEFINE_string(
    'input_path', None,
    'Path to the input zip file (currently only zip files are supported)')
flags.DEFINE_string('output_dir', './', 'Path to the output directory')
flags.DEFINE_boolean(
    'has_percent', False,
    '[for processing] Set if the dataset has percentage values that need to be '
    'converted to counts')
flags.DEFINE_boolean(
    'debug', False,
    '[for processing] Set the flag to add additional debug columns to the output')
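
# Example invocation (illustrative; the spec and input paths are placeholders):
#   python3 process.py --option=all --table_prefix=S1603 --has_percent \
#     --spec_path=<path to JSON spec> --input_path=<path to ACS S1603 zip> \
#     --output_dir=./output/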


class S1603SubjectTableDataLoader(SubjectTableDataLoaderBase):
@@ -57,7 +84,11 @@ def _process_dataframe(self, df, filename):
}

csv_file = open(self.clean_csv_path, 'a')
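# Resolve each row's geography identifier to a Data Commons place dcid; some ACS
# exports name this column 'Geography' while others use 'id', so handle both.
# (Illustrative assumption: an identifier like '0500000US06085' would map to a
# dcid such as 'geoId/06085'.)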
place_geoIds = df['id'].apply(convert_to_place_dcid)
column_headers = list(df.columns.values)
if 'Geography' in column_headers:
place_geoIds = df['Geography'].apply(convert_to_place_dcid)
else:
place_geoIds = df['id'].apply(convert_to_place_dcid)

# update the clean csv
for column in df.columns.tolist():
@@ -126,3 +157,43 @@ def _process_dataframe(self, df, filename):
print(
f" Completed with { self.counter_dict[year]['number of observations'] } observation for { self.counter_dict[year]['number of unique StatVars with observations'] } StatVars at { self.counter_dict[year]['number of unique geos'] } places. ",
flush=True)


def set_column_map(input_path, spec_path, output_dir):
    """Generates the column map from the input zip and the JSON spec and writes
    it to column_map.json in the output directory."""
    generated_col_map = process_zip_file(input_path,
                                         spec_path,
                                         write_output=False)
    with open(os.path.join(output_dir, 'column_map.json'), 'w') as f:
        json.dump(generated_col_map, f, indent=4)


def main(argv):
    option = FLAGS.option.lower()
    table_prefix = FLAGS.table_prefix
    spec_path = FLAGS.spec_path
    input_path = FLAGS.input_path
    output_dir = FLAGS.output_dir
    has_percent = FLAGS.has_percent
    debug = FLAGS.debug

    # Stage 1: generate the column map from the input zip and the spec.
    if option in ('colmap', 'all'):
        set_column_map(input_path, spec_path, output_dir)

    # Stage 2: process the input zip and write the cleaned output.
    if option in ('process', 'all'):
        data_loader = S1603SubjectTableDataLoader(
            table_id=table_prefix,
            col_delimiter='!!',
            has_percent=has_percent,
            debug=debug,
            output_path_dir=output_dir,
            json_spec=spec_path,
            column_map_path=os.path.join(output_dir, 'column_map.json'),
            decimal_places=3,
            estimate_period=5,
            header_row=1)
        data_loader._process_zip_file(input_path)


if __name__ == '__main__':
    flags.mark_flags_as_required(
        ['table_prefix', 'spec_path', 'input_path', 'output_dir'])
    app.run(main)
2 binary files not shown (file names not shown)