
Commit ea1aafb

resolved conflicts for aws-samples#13
2 parents: 5b61691 + d96c303

3 files changed: +59, -23 lines

utilities/Hive_metastore_migration/README.md

13 additions, 22 deletions

@@ -191,35 +191,26 @@ as an Glue ETL job, if AWS Glue can directly connect to your Hive metastore.
 2. Submit the `hive_metastore_migration.py` Spark script to your Spark cluster
    using the following parameters:

-   - Set `--direction` to `from_metastore`, or omit the argument since
-     `from_metastore` is the default.
+   - Set `--config_file` to `<path_to_your_config_yaml_file>` (default path: `artifacts/config.yaml`)
+
+   - Provide the following configuration parameters in the configuration yaml file:
+     ```
+     * mode
+     * jdbc-url
+     * jdbc-username
+     * jdbc-password
+     * database-prefix
+     * table-prefix
+     ```

-   - Provide the JDBC connection information through these arguments:
-     `--jdbc-url`, `--jdbc-username`, and `--jdbc-password`.
-
-   - The argument `--output-path` is required. It is either a local file system location
-     or an S3 location. If the output path is a local directory, you can upload the data
-     to an S3 location manually. If it is an S3 path, you need to make sure that the Spark
-     cluster has EMRFS library in its class path. The script will export the metadata to a
-     subdirectory of the output-path you provided.
-
-   - `--database-prefix` and `--table-prefix` (optional) to set a string prefix that is applied to the
-     database and table names. They are empty by default.
-
   - Example spark-submit command to migrate Hive metastore to S3, tested on EMR-4.7.1:
-     ```bash
+     ```bash
     MYSQL_JAR_PATH=/usr/lib/hadoop/mysql-connector-java-5.1.42-bin.jar
     DRIVER_CLASSPATH=/home/hadoop/*:/etc/hadoop/conf:/etc/hive/conf:/usr/lib/hadoop-lzo/lib/*:/usr/lib/hadoop/hadoop-aws.jar:/usr/share/aws/aws-java-sdk/*:/usr/share/aws/emr/emrfs/conf:/usr/share/aws/emr/emrfs/lib/*:/usr/share/aws/emr/emrfs/auxlib/*:$MYSQL_JAR_PATH
     spark-submit --driver-class-path $DRIVER_CLASSPATH \
       --jars $MYSQL_JAR_PATH \
       /home/hadoop/hive_metastore_migration.py \
-      --mode from-metastore \
-      --jdbc-url jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306 \
-      --jdbc-user hive \
-      --jdbc-password myJDBCPassword \
-      --database-prefix myHiveMetastore_ \
-      --table-prefix myHiveMetastore_ \
-      --output-path s3://mybucket/myfolder/
+      --config_file artifacts/config.yaml
     ```

 - If the job finishes successfully, it creates 3 sub-folders in the S3 output path you
8 additions, 0 deletions

@@ -0,0 +1,8 @@
+mode:
+jdbc-url:
+jdbc-username:
+jdbc-password:
+database-prefix:
+table-prefix:
+output-path:
+input_path:
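For reference, here is a minimal sketch of what this template might look like once filled in, and how it can be read with PyYAML. The values are lifted from the spark-submit example that was removed from the README; they are placeholders, not part of this commit, and `yaml.safe_load` is shown simply as the loader that restricts YAML to plain data types.

```python
import yaml

# Hypothetical filled-in config for a from-metastore export; the values mirror the
# old README example and are placeholders only.
EXAMPLE_CONFIG = """
mode: from-metastore
jdbc-url: jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306
jdbc-username: hive
jdbc-password: myJDBCPassword
database-prefix: myHiveMetastore_
table-prefix: myHiveMetastore_
output-path: s3://mybucket/myfolder/
"""

# safe_load parses the YAML into a plain dict without constructing arbitrary objects.
options = yaml.safe_load(EXAMPLE_CONFIG)
print(options["mode"])      # -> 'from-metastore'
print(options["jdbc-url"])  # -> 'jdbc:mysql://metastore.foo.us-east-1.rds.amazonaws.com:3306'
```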

utilities/Hive_metastore_migration/src/hive_metastore_migration.py

38 additions, 1 deletion

@@ -9,6 +9,7 @@
 # except for python 2.7 standard library and Spark 2.1
 import sys
 from datetime import datetime, timedelta, tzinfo
+import yaml
 from time import localtime, strftime
 from types import MethodType

@@ -1606,6 +1607,39 @@ def parse_arguments(args):
     return options


+def parse_arguments_from_yaml_file(args):
+    """
+    This function accepts the path to a config file
+    and extracts the needed arguments for the metastore migration
+    ----------
+    Return:
+    Dictionary of config options
+    """
+    parser = argparse.ArgumentParser(prog=args[0])
+    parser.add_argument('-f', '--config_file', required=True, default='artifacts/config.yaml`', help='Provide yaml configuration file path to read migration arguments from. Default path: `artifacts/config.yaml`')
+    options = get_options(parser, args)
+    config_file_path = options['config_file']
+    ## read the yaml file
+    with open(config_file_path, 'r') as yaml_file_stream:
+        config_options = yaml.load(yaml_file_stream)
+
+    if config_options['mode'] == FROM_METASTORE:
+        validate_options_in_mode(
+            options=config_options, mode=FROM_METASTORE,
+            required_options=['output_path'],
+            not_allowed_options=['input_path']
+        )
+    elif config_options['mode'] == TO_METASTORE:
+        validate_options_in_mode(
+            options=config_options, mode=TO_METASTORE,
+            required_options=['input_path'],
+            not_allowed_options=['output_path']
+        )
+    else:
+        raise AssertionError('unknown mode ' + options['mode'])
+
+    return config_options
+
 def get_spark_env():
     try:
         sc = SparkContext.getOrCreate()

@@ -1733,7 +1767,10 @@ def validate_aws_regions(region):


 def main():
-    options = parse_arguments(sys.argv)
+    # options = parse_arguments(sys.argv)
+
+    ## This now reads options from path to config yaml file
+    options = parse_arguments_from_yaml_file(sys.argv)

     connection = {"url": options["jdbc_url"], "user": options["jdbc_username"], "password": options["jdbc_password"]}
     db_prefix = options.get("database_prefix") or ""
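To tie the pieces together, here is a minimal sketch of the code path `main()` now follows: read the config file, then build the JDBC connection dictionary. The path and the underscore-style keys (`jdbc_url`, `jdbc_username`, ...) are assumptions made to match the lookups shown in `main()`, and `yaml.safe_load` is used instead of the bare `yaml.load` call in the commit, since recent PyYAML releases expect an explicit Loader argument for the latter.

```python
import yaml

# Illustrative path only; the script's --config_file default points at artifacts/config.yaml.
CONFIG_PATH = 'artifacts/config.yaml'

with open(CONFIG_PATH, 'r') as yaml_file_stream:
    # safe_load needs no Loader argument and limits the YAML to plain data types.
    options = yaml.safe_load(yaml_file_stream)

# Build the JDBC connection dict the same way main() does; the key names are assumed
# to use underscores here so they line up with the lookups shown in the diff above.
connection = {
    'url': options['jdbc_url'],
    'user': options['jdbc_username'],
    'password': options['jdbc_password'],
}
db_prefix = options.get('database_prefix') or ''
```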
