diff --git a/utilities/Hive_metastore_migration/README.md b/utilities/Hive_metastore_migration/README.md
index a20ffe53..999afc08 100644
--- a/utilities/Hive_metastore_migration/README.md
+++ b/utilities/Hive_metastore_migration/README.md
@@ -186,35 +186,28 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore.
 2. Submit the `hive_metastore_migration.py` Spark script to your Spark cluster
    using the following parameters:
 
-   - Set `--direction` to `from_metastore`, or omit the argument since
-     `from_metastore` is the default.
+   - Set `--config_file` to the path of the yaml configuration file
+     (default path: `artifacts/config.yaml`).
+
+   - Provide the following configuration parameters in the yaml file:
+   ```
+    * mode
+    * jdbc_url
+    * jdbc_username
+    * jdbc_password
+    * database_prefix
+    * table_prefix
+    * output_path
+   ```
-   - Provide the JDBC connection information through these arguments:
-     `--jdbc-url`, `--jdbc-username`, and `--jdbc-password`.
-
-   - The argument `--output-path` is required. It is either a local file system location
-     or an S3 location. If the output path is a local directory, you can upload the data
-     to an S3 location manually. If it is an S3 path, you need to make sure that the Spark
-     cluster has EMRFS library in its class path. The script will export the metadata to a
-     subdirectory of the output-path you provided.
-
-   - `--database-prefix` and `--table-prefix` (optional) to set a string prefix that is applied to the
-     database and table names. They are empty by default.
-
   Example spark-submit command to migrate Hive metastore to S3, tested on EMR-4.7.1:
-  ```bash
+  ```bash
   MYSQL_JAR_PATH=/usr/lib/hadoop/mysql-connector-java-5.1.42-bin.jar
   DRIVER_CLASSPATH=/home/hadoop/*:/etc/hadoop/conf:/etc/hive/conf:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:$MYSQL_JAR_PATH
   spark-submit --driver-class-path $DRIVER_CLASSPATH \
     --jars $MYSQL_JAR_PATH \
     /home/hadoop/hive_metastore_migration.py \
-    --mode from-metastore \
-    --jdbc-url jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306 \
-    --jdbc-user hive \
-    --jdbc-password myJDBCPassword \
-    --database-prefix myHiveMetastore_ \
-    --table-prefix myHiveMetastore_ \
-    --output-path s3://mybucket/myfolder/
+    --config_file artifacts/config.yaml
   ```
 
 - If the job finishes successfully, it creates 3 sub-folders in the S3 output path you
diff --git a/utilities/Hive_metastore_migration/artifacts/config.yaml b/utilities/Hive_metastore_migration/artifacts/config.yaml
new file mode 100644
index 00000000..81a82093
--- /dev/null
+++ b/utilities/Hive_metastore_migration/artifacts/config.yaml
@@ -0,0 +1,8 @@
+mode:
+jdbc_url:
+jdbc_username:
+jdbc_password:
+database_prefix:
+table_prefix:
+output_path:
+input_path:
\ No newline at end of file
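
For reference, a filled-in `config.yaml` for a `from-metastore` export could look like the sketch below. The values are purely illustrative and mirror the placeholders from the old spark-submit example (metastore endpoint, credentials, prefixes, and S3 output path); `input_path` is left empty because it is only used for a `to-metastore` migration:

```yaml
# Illustrative placeholder values; substitute your own metastore endpoint, credentials, and S3 path
mode: from-metastore
jdbc_url: jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306
jdbc_username: hive
jdbc_password: myJDBCPassword
database_prefix: myHiveMetastore_
table_prefix: myHiveMetastore_
output_path: s3://mybucket/myfolder/
input_path:
```
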
diff --git a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py
index 16562256..b6447f1d 100644
--- a/utilities/Hive_metastore_migration/src/hive_metastore_migration.py
+++ b/utilities/Hive_metastore_migration/src/hive_metastore_migration.py
@@ -14,6 +14,7 @@
 # except for python 2.7 standard library and Spark 2.1
 import sys
 import argparse
+import yaml
 import re
 import logging
 from time import localtime, strftime
@@ -1398,6 +1399,43 @@ def parse_arguments(args):
     return options
 
 
+def parse_arguments_from_yaml_file(args):
+    """
+    Accepts the path to a yaml config file and extracts the arguments
+    needed for the metastore migration.
+    ----------
+    Return:
+        Dictionary of config options
+    """
+    parser = argparse.ArgumentParser(prog=args[0])
+    parser.add_argument('-f', '--config_file', required=False, default='artifacts/config.yaml',
+                        help='Path of the yaml configuration file to read migration arguments from. Default: artifacts/config.yaml')
+    options = get_options(parser, args)
+    config_file_path = options['config_file']
+
+    # Read and parse the yaml configuration file
+    with open(config_file_path, 'r') as yaml_file_stream:
+        config_options = yaml.safe_load(yaml_file_stream)
+
+    # Validate the options for the requested migration direction
+    if config_options['mode'] == FROM_METASTORE:
+        validate_options_in_mode(
+            options=config_options, mode=FROM_METASTORE,
+            required_options=['output_path'],
+            not_allowed_options=['input_path']
+        )
+    elif config_options['mode'] == TO_METASTORE:
+        validate_options_in_mode(
+            options=config_options, mode=TO_METASTORE,
+            required_options=['input_path'],
+            not_allowed_options=['output_path']
+        )
+    else:
+        raise AssertionError('unknown mode ' + str(config_options['mode']))
+
+    return config_options
+
+
 def get_spark_env():
     conf = SparkConf()
     sc = SparkContext(conf=conf)
@@ -1501,7 +1539,8 @@ def validate_aws_regions(region):
 
 
 def main():
-    options = parse_arguments(sys.argv)
+    # Read migration options from the yaml configuration file
+    options = parse_arguments_from_yaml_file(sys.argv)
 
     connection = {
         'url': options['jdbc_url'],
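
As a quick sanity check of the new configuration flow outside of Spark, here is a minimal sketch (assuming the config file sits at the default `artifacts/config.yaml` path) of how `yaml.safe_load` turns the file into the flat dictionary whose underscore-style keys the rest of the script indexes:

```python
import yaml

# Assumed default location; the job accepts a different path via --config_file
CONFIG_PATH = 'artifacts/config.yaml'

with open(CONFIG_PATH, 'r') as stream:
    # safe_load parses plain scalars and mappings without constructing arbitrary Python objects
    config_options = yaml.safe_load(stream)

# Keys must match what main() and the validators look up, e.g. options['jdbc_url']
# and options['output_path'], which is why the yaml file uses underscores.
for key in ('mode', 'jdbc_url', 'jdbc_username', 'output_path'):
    print('{0} = {1}'.format(key, config_options.get(key)))
```

Preferring `yaml.safe_load` over a bare `yaml.load` call keeps the script from instantiating arbitrary tagged objects out of an untrusted configuration file.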