Adding HDFS support for data generation (#188)
* Changes for adding hdfs submitter class

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Working changes to GenTable for hdfs run

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Changes for mapReduce

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Changes to makefile

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Adding comments to the Java file

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Removed redundant files

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Correcting typo in query stream

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Review changes

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Fixing typo in example command

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Changes for supporting json_summary+sub_queries -s

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

* Correcting typo and README

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>

---------

Signed-off-by: Sayed Bilal Bari <sbari@nvidia.com>
Co-authored-by: Sayed Bilal Bari <sbari@nvidia.com>
Authored by bilalbari on Jul 3, 2024
1 parent 32fa7c6 commit c41b702
Showing 8 changed files with 688 additions and 57 deletions.
10 changes: 8 additions & 2 deletions nds-h/README.md
@@ -103,8 +103,9 @@ To generate data for local -

```bash
$ python nds_h_gen_data.py -h
-usage: nds_h_gen_data.py [-h] <scale> <parallel> <data_dir> [--overwrite_output]
+usage: nds_h_gen_data.py [-h] {local,hdfs} <scale> <parallel> <data_dir> [--overwrite_output]
positional arguments:
{local,hdfs} file system to save the generated data.
scale volume of data to generate in GB.
parallel build data in <parallel_value> separate chunks
data_dir generate data in directory.
@@ -118,7 +119,7 @@ optional arguments:
Example command:
```bash
-python nds_h_gen_data.py 100 100 /data/raw_sf100 --overwrite_output
+python nds_h_gen_data.py hdfs 100 100 /data/raw_sf100 --overwrite_output
```
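For incremental generation, the `--range` flag (defined in `nds_h_gen_data.py`, shown later in this commit) can be combined with either target file system. A minimal sketch, assuming the same scale factor and output path as above:

```bash
# Illustrative only: generate child chunks 1-50, then 51-100, of a 100-chunk scale factor 100 dataset.
python nds_h_gen_data.py hdfs 100 100 /data/raw_sf100 --range 1,50
python nds_h_gen_data.py hdfs 100 100 /data/raw_sf100 --range 51,100
```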
### Convert DSV to Parquet or Other data sources
@@ -219,6 +220,11 @@ optional arguments:
type of query output
--property_file PROPERTY_FILE
property file for Spark configuration.
--json_summary_folder JSON_SUMMARY_FOLDER
                        Empty folder/path (will be created if it does not exist) to save a JSON summary file for each query.
--sub_queries SUB_QUERIES
                        comma-separated list of queries to run. If not specified, all queries in the stream file will be run.
                        e.g. "query1,query2,query3". Note: use the "_part1", "_part2" and "_part3" suffixes for the following query names: query15
```
Example command to submit nds_h_power.py via the spark-submit-template utility:
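An illustrative invocation that exercises the new `--json_summary_folder` and `--sub_queries` flags is sketched below; the template name and the positional arguments are placeholders rather than values taken from the repository:

```bash
# Hypothetical example -- substitute the real template file and positional arguments for your environment.
./spark-submit-template power_run_gpu.template \
    nds_h_power.py \
    <parquet_data_root> \
    <query_stream_file> \
    <time_log_file> \
    --json_summary_folder json_summary \
    --sub_queries query1,query2,query15_part1,query15_part2
```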
115 changes: 101 additions & 14 deletions nds-h/nds_h_gen_data.py
@@ -34,6 +34,7 @@
import os
import sys
import subprocess
import shutil

#For adding utils to path
parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
@@ -58,6 +59,7 @@
'supplier'
]


def generate_data_local(args, range_start, range_end, tool_path):
"""Generate data to local file system. TPC-DS tool will generate all table data under target
folder without creating sub-folders for each table. So we add extra code to create sub folder
@@ -87,10 +89,10 @@ def generate_data_local(args, range_start, range_end, tool_path):
procs = []
for i in range(range_start, range_end + 1):
dbgen = ["-s", args.scale,
"-C", args.parallel,
"-S", str(i),
"-v", "Y",
"-f","Y"]
"-C", args.parallel,
"-S", str(i),
"-v", "Y",
"-f", "Y"]
procs.append(subprocess.Popen(
["./dbgen"] + dbgen, cwd=str(work_dir)))
# wait for data generation to complete
@@ -104,45 +106,130 @@ def generate_data_local(args, range_start, range_end, tool_path):
for table in table_names:
print('mkdir -p {}/{}'.format(data_dir, table))
subprocess.run(['mkdir', '-p', data_dir + '/' + table])
-        if (table != 'region' and table !='nation'):
+        if (table != 'region' and table != 'nation'):
for i in range(range_start, range_end + 1):
subprocess.run(['mv', f'{work_dir}/{table}.tbl.{i}',
f'{data_dir}/{table}/'], stderr=subprocess.DEVNULL)
else:
subprocess.run(['mv', f'{work_dir}/{table}.tbl',
f'{data_dir}/{table}/'], stderr=subprocess.DEVNULL)
    # region and nation have no parallel-number suffix in the file name, so they are moved separately above
# show summary
subprocess.run(['du', '-h', '-d1', data_dir])


def clean_temp_data(temp_data_path):
cmd = ['hadoop', 'fs', '-rm', '-r', '-skipTrash', temp_data_path]
print(" ".join(cmd))
subprocess.run(cmd)


def merge_temp_tables(temp_data_path, parent_data_path):
"""Helper functions for incremental data generation. Move data in temporary child range path to
parent directory.
Args:
temp_data_path (str): temorary child range data path
parent_data_path (str): parent data path
"""
table_names = source_table_names
for table_name in table_names:
# manually create table sub-folders
# redundant step if it's not the first range part.
cmd = ['hadoop', 'fs', '-mkdir', parent_data_path + '/' + table_name]
print(" ".join(cmd))
subprocess.run(cmd)
# move temp content to upper folder
# note not all tables are generated in different child range step
# please ignore messages like "mv: `.../reason/*': No such file or directory"
temp_table_data_path = temp_data_path + '/' + table_name + '/*'
cmd = ['hadoop', 'fs', '-mv', temp_table_data_path,
parent_data_path + '/' + table_name + '/']
print(" ".join(cmd))
subprocess.run(cmd)
clean_temp_data(temp_data_path)


def generate_data_hdfs(args, jar_path):
"""generate data to hdfs using TPC-DS dsdgen tool. Support incremental generation: due to the
limit of hdfs, each range data will be generated under a temporary folder then move to target
folder.
Args:
args (Namespace): Namespace from argparser
jar_path (str): path to the target jar
Raises:
Exception: if Hadoop binary is not installed.
"""
# Check if hadoop is installed.
if shutil.which('hadoop') is None:
raise Exception('No Hadoop binary found in current environment, ' +
'please install Hadoop for data generation in cluster.')
# Submit hadoop MR job to generate data
cmd = ['hadoop', 'jar', str(jar_path)]
cmd += ['-p', args.parallel, '-s', args.scale]
    # derive the generator tool directory from the jar path, assuming the user won't change the file structure
tpcds_gen_path = jar_path.parent.parent.absolute()
if args.overwrite_output:
cmd += ['-o']
if args.range:
# use a temp folder to save the specific range data.
# will move the content to parent folder afterwards.
# it's a workaround for "Output directory ... already exists" in incremental generation
temp_data_path = args.data_dir + '/_temp_'
# before generation, we remove "_temp_" folders in case they contain garbage generated by
# previous user runs.
clean_temp_data(temp_data_path)
cmd.extend(["-r", args.range])
cmd.extend(["-d", temp_data_path])
try:
subprocess.run(cmd, check=True, cwd=str(tpcds_gen_path))
            # merge the generated range from the temporary folder into the target data folder
merge_temp_tables(temp_data_path, args.data_dir)
finally:
clean_temp_data(temp_data_path)
else:
cmd.extend(["-d", args.data_dir])
subprocess.run(cmd, check=True, cwd=str(tpcds_gen_path))
        # a full (non-range) run writes directly to args.data_dir, so no merge step is needed
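For reference, the Hadoop command assembled above takes roughly the following shape. This is a minimal sketch assuming scale 100, parallel 100 and the example output path from the README, with the jar path standing in for whatever `check_build_nds_h` returns:

```bash
# Full run (no --range): write straight to the target directory.
hadoop jar <path/to/tpch-gen.jar> -p 100 -s 100 -d /data/raw_sf100

# Incremental run (--range 1,50): write to a _temp_ folder that is merged into the target afterwards.
hadoop jar <path/to/tpch-gen.jar> -p 100 -s 100 -r 1,50 -d /data/raw_sf100/_temp_
```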


def generate_data(args):
-    tool_path = check_build_nds_h()
+    jar_path, tool_path = check_build_nds_h()
range_start = 1
range_end = int(args.parallel)
if args.range:
range_start, range_end = valid_range(args.range, args.parallel)
-    generate_data_local(args, range_start, range_end, tool_path)
+    if args.type == 'hdfs':
+        generate_data_hdfs(args, jar_path)
+    else:
+        generate_data_local(args, range_start, range_end, tool_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
parser.add_argument("type",
choices=["local", "hdfs"],
help="file system to save the generated data.")
parser.add_argument("scale",
help="volume of data to generate in GB. Accepted SF - 1,10, 100, 300, 1000 \
,3000, 10000, 30000,"
)
parser.add_argument("parallel",
type=parallel_value_type,
help="build data in <parallel_value> separate chunks"
)
parser.add_argument("data_dir",
help="generate data in directory.")
parser.add_argument('--range',
help='Used for incremental data generation, meaning which part of child' +
'chunks are generated in one run. Format: "start,end", both are inclusive. ' +
'e.g. "1,100". Note: the child range must be within the "parallel", ' +
'"--parallel 100 --range 100,200" is illegal.')
parser.add_argument("--overwrite_output",
action="store_true",
help="overwrite if there has already existing data in the path provided.")

args = parser.parse_args()
generate_data(args)
2 changes: 1 addition & 1 deletion nds-h/nds_h_gen_query_stream.py
@@ -81,7 +81,7 @@ def generate_query_streams(args, tool_path):
subprocess.run(base_cmd, check=True, cwd=str(work_dir),stdout=f)

if __name__ == "__main__":
-    tool_path = check_build_nds_h()
+    jar_path, tool_path = check_build_nds_h()
    parser = argparse.ArgumentParser()
parser.add_argument("scale",
help="Assume a database of this scale factor.")

