From 0135edb5c1b94960ce9db42748888aad260a8337 Mon Sep 17 00:00:00 2001
From: Gabe Fierro
Date: Thu, 27 Mar 2014 10:35:36 -0700
Subject: [PATCH 01/33] remove unnecessary options from config parsing

---
 lib/config_parser.py |  2 --
 process.cfg          |  5 -----
 start.py             | 12 ++++--------
 3 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/lib/config_parser.py b/lib/config_parser.py
index 5105fac6..5bea9ac2 100755
--- a/lib/config_parser.py
+++ b/lib/config_parser.py
@@ -50,8 +50,6 @@ def extract_process_options(handler, config_section):
     result['parse'] = handler.get('process','parse')
     result['clean'] = handler.get('process','clean') == 'True'
     result['consolidate'] = handler.get('process','consolidate') == 'True'
-    result['outputdir'] = handler.get('process','outputdir')
-    result['lowmemory'] = handler.get('process','lowmemory') == 'True'
     result['doctype'] = handler.get(config_section,'doctype')
     return result

diff --git a/process.cfg b/process.cfg
index 2de36308..0c85e309 100755
--- a/process.cfg
+++ b/process.cfg
@@ -6,20 +6,15 @@
 # parse: defines which parsing configuration will be run
 # clean: if True, runs the cleaning step on the output of parse
 # consolidate: if True, runs the conslidation step on the output of clean
-# outputdir: specifies the final destination of the resulting sqlite3 files
 # doctype: can be grant, application, or all, and processing will proceed accordingly.
 #     Note: make sure that the value for grantregex and/or applicationregex
 #     is defined if you wish to use a value other than the default for either
-# lowmemory: if True, runs the clean/consolidation scripts in such a way that
-#     they require less memory, but are slower and possibly less accurate
 
 [process]
 parse=download
 clean=True
 consolidate=True
-outputdir=.
 doctype=all
-lowmemory=True
 
 #[defaultparse]
 ## 'datadir' specifies the path to the directory containing the XML files that

diff --git a/start.py b/start.py
index 1af57db8..7249c79c 100755
--- a/start.py
+++ b/start.py
@@ -140,9 +140,7 @@ def run_clean(process_config):
     if not process_config['clean']:
         return
     doctype = process_config['doctype']
-    command = 'python clean.py'
-    if process_config['lowmemory']:
-        command = 'bash run_clean.sh'
+    command = 'bash run_clean.sh'
     if doctype in ['all', 'grant']:
         os.system(command + ' grant')
     if doctype in ['all', 'application']:
@@ -152,9 +150,8 @@ def run_consolidate(process_config):
     if not process_config['consolidate']:
         return
     doctype = process_config['doctype']
-    command = 'python consolidate.py'
-    if process_config['lowmemory']:
-        command = 'bash run_consolidation.sh'
+    # TODO: optionally include previous disambiguation
+    command = 'bash run_consolidate.sh'
     if doctype in ['all', 'grant']:
         os.system(command + ' grant')
     if doctype in ['all', 'application']:
@@ -204,7 +201,6 @@ def run_consolidate(process_config):
             .format(parse_config['applicationregex'], parse_config['datadir'], len(files))
     print 'Finished parsing in {0}'.format(str(f-s))
 
-    # run extra phases if needed, then move output files
+    # run extra phases if needed
     run_clean(process_config)
     run_consolidate(process_config)
-    parse.move_tables(process_config['outputdir'])
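After PATCH 01, the [process] section carries only the parse, clean, consolidate, and doctype options. For reference, a minimal sketch of reading that trimmed section with Python 2's stdlib ConfigParser, mirroring extract_process_options in lib/config_parser.py; the standalone helper name below is hypothetical and not part of the repository:

    # Minimal sketch (Python 2, stdlib ConfigParser) of reading the trimmed
    # [process] section from process.cfg. Helper name is hypothetical.
    from ConfigParser import ConfigParser

    def read_process_options(path='process.cfg', config_section='process'):
        handler = ConfigParser()
        handler.read(path)
        result = {}
        result['parse'] = handler.get('process', 'parse')              # e.g. 'download'
        result['clean'] = handler.get('process', 'clean') == 'True'
        result['consolidate'] = handler.get('process', 'consolidate') == 'True'
        result['doctype'] = handler.get(config_section, 'doctype')     # 'grant', 'application', or 'all'
        return result

    if __name__ == '__main__':
        print(read_process_options())
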
From d13015c63e363ebbb370e6ed5355698532c399d3 Mon Sep 17 00:00:00 2001
From: Gabe Fierro
Date: Thu, 27 Mar 2014 10:35:50 -0700
Subject: [PATCH 02/33] make join on previous disambig output optional

---
 consolidate.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/consolidate.py b/consolidate.py
index 794f2b34..944d6b6f 100755
--- a/consolidate.py
+++ b/consolidate.py
@@ -156,17 +156,18 @@ def join(oldfile, newfile):
     merged.to_csv('disambiguator_{0}.tsv'.format(datetime.now().strftime('%B_%d')), index=False, header=None, sep='\t')
 
 if __name__ == '__main__':
-    if len(sys.argv) < 2:
-        print "Provide path to previous disambiguation output"
-        pritn "USAGE: python consolidate.py "
-        sys.exit(1)
-    prev_output = sys.argv[1]
     for year in range(1975, datetime.today().year+1):
-        print 'Running year',year,datetime.now(),'for grant'
-        main(year, 'grant')
+        print 'Running year',year,datetime.now(),'for grant'
+        main(year, 'grant')
     for year in range(2001, datetime.today().year+1):
-        print 'Running year',year,datetime.now(),'for application'
-        main(year, 'application')
+        print 'Running year',year,datetime.now(),'for application'
+        main(year, 'application')
 
-    # join files
-    join(prev_output, 'disambiguator.csv')
+    if len(sys.argv) < 2:
+        print "Provide path to previous disambiguation output"
+        print "USAGE: python consolidate.py "
+        print "Not joining on previous records"
+    else:
+        prev_output = sys.argv[1]
+        # join files
+        join(prev_output, 'disambiguator.csv')

From 3bb0881c5d202c6f20bf6325bcaab33da7fe2ff5 Mon Sep 17 00:00:00 2001
From: Gabe Fierro
Date: Thu, 27 Mar 2014 10:59:31 -0700
Subject: [PATCH 03/33] insert most recent location for disambiguated inventor/assignee into linking tables

---
 integrate.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/integrate.py b/integrate.py
index 5b03a049..258333db 100644
--- a/integrate.py
+++ b/integrate.py
@@ -143,6 +143,7 @@ def integrate(disambig_input_file, disambig_output_file):
     assigneelocation = assigneelocation[assigneelocation[0].notnull()]
     assigneelocation = assigneelocation[assigneelocation[1].notnull()]
     assigneelocation.columns = ['location_id','assignee_id']
+    assigneelocation = assigneelocation.drop_duplicates(cols='assignee_id',take_last=True)
     locationassignee_inserts = [row[1].to_dict() for row in assigneelocation.iterrows()]
     if doctype == 'grant':
         bulk_commit_inserts(locationassignee_inserts, alchemy.schema.locationassignee, alchemy.is_mysql(), 20000, 'grant')
@@ -159,7 +160,7 @@ def integrate(disambig_input_file, disambig_output_file):
     inventorlocation = inventorlocation[inventorlocation[0].notnull()]
     inventorlocation = inventorlocation[inventorlocation[1].notnull()]
     inventorlocation.columns = ['location_id','inventor_id']
-    inventorlocation = inventorlocation.drop_duplicates(cols=['location_id','inventor_id'])
+    inventorlocation = inventorlocation.drop_duplicates(cols='inventor_id',take_last=True)
     locationinventor_inserts = [row[1].to_dict() for row in inventorlocation.iterrows()]
     if doctype == 'grant':
         bulk_commit_inserts(locationinventor_inserts, alchemy.schema.locationinventor, alchemy.is_mysql(), 20000, 'grant')

From 69beee9a70361c8b484cc5d9e6e3e8f5f25592c9 Mon Sep 17 00:00:00 2001
From: Gabe Fierro
Date: Thu, 27 Mar 2014 11:23:27 -0700
Subject: [PATCH 04/33] fix typo

---
 start.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/start.py b/start.py
index 7249c79c..44b63cf6 100755
--- a/start.py
+++ b/start.py
@@ -151,7 +151,7 @@ def run_consolidate(process_config):
         return
     doctype = process_config['doctype']
     # TODO: optionally include previous disambiguation
-    command = 'bash run_consolidate.sh'
+    command = 'bash run_consolidation.sh'
     if doctype in ['all', 'grant']:
         os.system(command + ' grant')
     if doctype in ['all', 'application']:
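A note on the drop_duplicates change in PATCH 03: cols=/take_last=True is the pre-0.17 pandas spelling; current pandas uses subset=/keep='last'. Keeping the last duplicate per assignee_id or inventor_id retains what is presumably the most recently written location row for that entity. A minimal sketch of the same idea with made-up sample data, written against current pandas:

    # Sketch of the per-inventor dedup from PATCH 03, using current pandas
    # spelling (subset=/keep='last' instead of the older cols=/take_last=True
    # that integrate.py targets). The rows below are made-up sample data.
    import pandas as pd

    inventorlocation = pd.DataFrame(
        [['loc_old',  'inv_1'],
         ['loc_new',  'inv_1'],    # later row for inv_1 wins
         ['loc_only', 'inv_2']],
        columns=['location_id', 'inventor_id'])

    # keep='last' retains the final occurrence of each inventor_id,
    # i.e. the most recently seen location, mirroring take_last=True
    inventorlocation = inventorlocation.drop_duplicates(
        subset='inventor_id', keep='last')

    print(inventorlocation.to_dict('records'))
    # [{'location_id': 'loc_new', 'inventor_id': 'inv_1'},
    #  {'location_id': 'loc_only', 'inventor_id': 'inv_2'}]
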
From c695ded71b926e3dbb13f32c6df64c09f4742cfb Mon Sep 17 00:00:00 2001
From: Gabe Fierro
Date: Thu, 27 Mar 2014 11:23:49 -0700
Subject: [PATCH 05/33] add Vagrant config

---
 vm/Vagrantfile          | 40 ++++++++++++++++++++++++++++++++++++++++
 vm/manifests/default.pp | 18 ++++++++++++++++++
 2 files changed, 58 insertions(+)
 create mode 100644 vm/Vagrantfile
 create mode 100644 vm/manifests/default.pp

diff --git a/vm/Vagrantfile b/vm/Vagrantfile
new file mode 100644
index 00000000..a79506a0
--- /dev/null
+++ b/vm/Vagrantfile
@@ -0,0 +1,40 @@
+# -*- mode: ruby -*-
+# vi: set ft=ruby :
+$script = <