CDPD-12081 Upgrade gcs connector to v1.9.17 from upstream in HDP 3.1-maint #8

Open

Wants to merge 69 commits into base branch HDP-3.1-maint

Commits (69)
1447c6c
Update versions for next connectors release development.
medb Nov 1, 2018
7babc9b
Use assertThrows where possible #cleanup
medb Nov 5, 2018
3608d4c
Use 'uri-path' as the default path codec.
medb Nov 6, 2018
2f580f8
Fix GCSIO integration test.
medb Dec 6, 2018
078cc17
Update GCS connector dependencies to latest versions.
medb Dec 6, 2018
cb07768
Parallelize Maven builds
medb Dec 10, 2018
e522221
Prefetch metadata with only 1 list request.
medb Dec 11, 2018
8978a99
Support lazy initialization of GHFS
medb Dec 13, 2018
eb8e5a9
Make it possible to set `fs.gs.system.bucket` to an empty string (#139)
jphalip Dec 15, 2018
5068483
Remove redundant log message before thrown exception in ResilientOper…
medb Dec 14, 2018
0bfa68a
Clean up.
medb Dec 17, 2018
15616bf
Set default for "fs.gs.working.dir" to "/" (#140)
jphalip Dec 20, 2018
c9088d9
Minor clean up
medb Dec 18, 2018
2392540
LSC: Remove or broaden the visibility of @VisibleForTesting annotatio…
graememorgan Dec 20, 2018
fc5b370
Fix compilation failures
medb Dec 20, 2018
9d77d19
Release GCS connector 1.9.11 and BQ connector 0.13.11.
Dec 20, 2018
03a23eb
Update versions for next connectors release development
Dec 20, 2018
f7c90df
Add Maven Central badges
medb Dec 20, 2018
8256670
Improve links in Maven Central badges
medb Dec 21, 2018
c2df8f3
Add LGTM badges
medb Dec 21, 2018
7b52cf8
Add more badges
medb Dec 23, 2018
0cb3521
Remove redundant license badge
medb Dec 24, 2018
9e572ec
Remove redundant lgtm.com badge
medb Dec 25, 2018
dafb138
Fix GCS connector documentation.
medb Dec 28, 2018
ab9b5ea
Improve exception message
medb Jan 3, 2019
55528bc
Fix partial read if exception is thrown on last retry.
medb Jan 3, 2019
6ef6121
Add info logging for failed delete and rename operations.
medb Jan 16, 2019
0e30d8a
Add kmsKeyName to GoogleCloudStorageWriteChannel (#146)
udim Jan 18, 2019
6600463
Update connectors dependencies.
medb Jan 25, 2019
5a9ed1c
Improve exception message.
medb Jan 26, 2019
7a7f06e
Minor fixes to read logic.
medb Jan 26, 2019
7713969
#cleanup test failure message formatting
medb Jan 26, 2019
75ce5ee
Fix for error-prone UnnecessaryParentheses warning
medb Jan 28, 2019
0a6a927
Roll-back Apache HTTP Client version to fix integration tests.
medb Jan 28, 2019
fcc8ede
Improve GCS IO exception messages.
medb Jan 28, 2019
1942282
Initial set for location restriction. Since the regions do not exist …
DanSedov Jan 29, 2019
6b28380
Improve `testMultipleDeleteBucket` test case.
medb Jan 29, 2019
a1673fd
Parallelize get and list GCS requests and remove redundant GCS reques…
medb Jan 29, 2019
a711b8f
Fix bug that could lead to data duplication when reading files with G…
medb Jan 30, 2019
8d17fe9
Release GCS connector 1.9.12 and BQ connector 0.13.12.
medb Jan 30, 2019
adf9311
Update versions for next connectors release development
medb Jan 30, 2019
52f5055
Fix directory inference.
medb Feb 4, 2019
142c51b
Release GCS connector 1.9.13 and BQ connector 0.13.13.
medb Feb 4, 2019
e564678
Update versions for next connectors release development
medb Feb 4, 2019
dcae140
Add Stack Overflow as a Q&A resource for connector questions.
aman-ebay Feb 8, 2019
d3f61ab
Clean up Markdown formatting [skip ci]
mbrukman Feb 8, 2019
f33f594
Improve README files
medb Feb 8, 2019
5cabe18
Fix usages of any(<Primitive>.class) matchers
TimvdLippe Feb 4, 2019
d9ea4b8
Implement Hadoop File Sytem concat method (#147)
tomwhite Feb 11, 2019
3729769
Use daemon threads for concurrent globbing.
medb Feb 11, 2019
02b17d1
Add Hadoop File System extended attributes support.
medb Feb 14, 2019
4a428d5
Fix Hadoop 1 tests
medb Feb 14, 2019
e7dfbd9
Release GCS connector 1.9.14 and BQ connector 0.13.14.
medb Feb 14, 2019
4e097f2
Update versions for next connectors release development
medb Feb 14, 2019
c920e4e
Do not enforce directory path for inferred implicit directories
medb Feb 21, 2019
1550a05
Do not parallelize GCS list requests, because it leads to too high QP…
medb Feb 21, 2019
c2c14e3
Release GCS connector 1.9.15 and BQ connector 0.13.15.
medb Feb 21, 2019
137337c
Update versions for next connectors release development
medb Feb 23, 2019
9e4ace2
Break and stop the pagination if we have the results (#152)
Fokko Feb 23, 2019
f81d823
Use getMaxRemainingResults instead of manual computation
medb Feb 24, 2019
bde7be4
Eagerly fetch GoogleCloudStorageReadChannel metadata if 'fs.gs.inputs…
medb Feb 25, 2019
6abbb16
Release GCS connector 1.9.16 and BQ connector 0.13.16.
medb Feb 25, 2019
dd6b3cc
Update versions for next connectors release development
medb May 15, 2019
018749f
BigQuery connector: support nested record type in field schema.
Apr 27, 2019
152ff92
Initialize metadata in GoogleCloudStorageReadChannel.size() method, i…
medb May 13, 2019
1194136
Add property to parallelize GCS requests in `listStatus` and `getFile…
medb May 14, 2019
e6f742b
Add support for partitioned BigQuery tables (#163)
pbonito May 15, 2019
6fb7795
Release GCS connector 1.9.17 and BQ connector 0.13.17.
medb May 15, 2019
8461dc1
HWX: Modify pom to include HDP version
sidseth Nov 9, 2018
29 changes: 24 additions & 5 deletions README.md
@@ -1,16 +1,28 @@
# bigdata-interop

[![GitHub release](https://img.shields.io/github/release/GoogleCloudPlatform/bigdata-interop.svg)](https://github.com/GoogleCloudPlatform/bigdata-interop/releases/latest)
[![GitHub release date](https://img.shields.io/github/release-date/GoogleCloudPlatform/bigdata-interop.svg)](https://github.com/GoogleCloudPlatform/bigdata-interop/releases/latest)
[![Code Quality: Java](https://img.shields.io/lgtm/grade/java/g/GoogleCloudPlatform/bigdata-interop.svg?logo=lgtm&logoWidth=18)](https://lgtm.com/projects/g/GoogleCloudPlatform/bigdata-interop/context:java)

Libraries and tools for interoperability between Apache Hadoop related
open-source software and Google Cloud Platform.

## Google Cloud Storage connector for Apache Hadoop

[![Maven Central](https://img.shields.io/maven-central/v/com.google.cloud.bigdataoss/gcs-connector/hadoop1.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:com.google.cloud.bigdataoss%20AND%20a:gcs-connector%20AND%20v:hadoop1-*)
[![Maven Central](https://img.shields.io/maven-central/v/com.google.cloud.bigdataoss/gcs-connector/hadoop2.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:com.google.cloud.bigdataoss%20AND%20a:gcs-connector%20AND%20v:hadoop2-*)
[![Maven Central](https://img.shields.io/maven-central/v/com.google.cloud.bigdataoss/gcs-connector/hadoop3.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:com.google.cloud.bigdataoss%20AND%20a:gcs-connector%20AND%20v:hadoop3-*)

The Google Cloud Storage connector for Hadoop enables running MapReduce jobs
directly on data in Google Cloud Storage by implementing the Hadoop FileSystem
interface. For details, see [the README](gcs/README.md).

## Google BigQuery connector for Apache Hadoop MapReduce

[![Maven Central](https://img.shields.io/maven-central/v/com.google.cloud.bigdataoss/bigquery-connector/hadoop1.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:com.google.cloud.bigdataoss%20AND%20a:bigquery-connector%20AND%20v:hadoop1-*)
[![Maven Central](https://img.shields.io/maven-central/v/com.google.cloud.bigdataoss/bigquery-connector/hadoop2.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:com.google.cloud.bigdataoss%20AND%20a:bigquery-connector%20AND%20v:hadoop2-*)
[![Maven Central](https://img.shields.io/maven-central/v/com.google.cloud.bigdataoss/bigquery-connector/hadoop3.svg?label=Maven%20Central)](https://search.maven.org/search?q=g:com.google.cloud.bigdataoss%20AND%20a:bigquery-connector%20AND%20v:hadoop3-*)

The Google BigQuery connector for Hadoop MapReduce enables running MapReduce
jobs on data in BigQuery by implementing the InputFormat & OutputFormat
interfaces. For more details see
@@ -24,9 +36,8 @@ For more details see [the README](pubsub/README.md)

## Building the Cloud Storage (GCS) and BigQuery connectors

All the connectors can be built with Apache Maven 3 (as of 2018-08-07, version
3.5.4 has been tested). To build the connector for specific Hadoop version, run
the following commands from the main directory:
To build the connector for specific Hadoop version, run the following commands
from the main directory:

```bash
# with Hadoop 1 support:
@@ -58,9 +69,17 @@ To add a dependency on one of the connectors using Maven, use the following:
<groupId>com.google.cloud.bigdataoss</groupId>
<!-- Cloud Storage: -->
<artifactId>gcs-connector</artifactId>
<version>hadoop2-1.9.10</version>
<version>hadoop2-1.9.16</version>
<!-- or, for BigQuery: -->
<artifactId>bigquery-connector</artifactId>
<version>hadoop2-0.13.10</version>
<version>hadoop2-0.13.16</version>
</dependency>
```

## Resources

On **Stack Overflow**, use the tag
[`google-cloud-dataproc`](https://stackoverflow.com/tags/google-cloud-dataproc)
for questions about the connectors in this repository. This tag receives
responses from the Stack Overflow community and Google engineers, who monitor
the tag and offer unofficial support.
47 changes: 47 additions & 0 deletions bigquery/CHANGES.txt
@@ -1,3 +1,50 @@
0.13.17 - 2019-05-15

1. POM updates for GCS connector 1.9.17.

2. Support nested record type in field schema in BigQuery connector.

3. Add a property to specify BigQuery tables partitioning definition:

mapred.bq.output.table.partitioning


0.13.16 - 2019-02-25

1. POM updates for GCS connector 1.9.16.


0.13.15 - 2019-02-21

1. POM updates for GCS connector 1.9.15.


0.13.14 - 2019-02-13

1. POM updates for GCS connector 1.9.14.


0.13.13 - 2019-02-04

1. POM updates for GCS connector 1.9.13.


0.13.12 - 2019-01-30

1. POM updates for GCS connector 1.9.12.

2. Improve exception message for BigQuery job execution errors.

3. Update all dependencies to latest versions.


0.13.11 - 2018-12-20

1. POM updates for GCS connector 1.9.11.

2. Update all dependencies to latest versions.


0.13.10 - 2018-11-01

1. POM updates for GCS connector 1.9.10.
9 changes: 4 additions & 5 deletions bigquery/pom.xml
@@ -22,7 +22,7 @@
<parent>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>bigdataoss-parent</artifactId>
<version>1.9.10</version>
<version>1.9.17</version>
<relativePath>../pom.xml</relativePath>
</parent>

@@ -32,7 +32,6 @@
</description>

<artifactId>bigquery-connector</artifactId>
<version>${hadoop.identifier}-0.13.10</version>

<profiles>
<profile>
@@ -183,7 +182,7 @@
<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>util-hadoop</artifactId>
<version>${hadoop.identifier}-${bigdataoss.version}</version>
<version>${bigdataoss.version}</version>
</dependency>
<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
@@ -195,12 +194,12 @@
<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>gcs-connector</artifactId>
<version>${hadoop.identifier}-${bigdataoss.version}</version>
<version>${bigdataoss.version}</version>
</dependency>
<dependency>
<groupId>com.google.cloud.bigdataoss</groupId>
<artifactId>gcs-connector</artifactId>
<version>${hadoop.identifier}-${bigdataoss.version}</version>
<version>${bigdataoss.version}</version>
<classifier>tests</classifier>
<scope>test</scope>
</dependency>
@@ -129,6 +129,12 @@ public class BigQueryConfiguration {
*/
public static final String OUTPUT_TABLE_SCHEMA_KEY = "mapred.bq.output.table.schema";

/**
* Configuration key for the output table partitioning used by the output format. This key is
* stored as a {@link String}.
*/
public static final String OUTPUT_TABLE_PARTITIONING_KEY = "mapred.bq.output.table.partitioning";

/**
* Configuration key for the Cloud KMS encryption key that will be used to protect output BigQuery
* table. This key is stored as a {@link String}.
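For context on the new key, here is a minimal, hypothetical sketch of wiring it into a Hadoop job configuration. The key name comes from the `OUTPUT_TABLE_PARTITIONING_KEY` constant above; the JSON value shape (a BigQuery `TimePartitioning` with `type` and `field`) and the `event_time` column are assumptions, not something this diff defines.

```java
import org.apache.hadoop.conf.Configuration;

public class PartitioningConfigSketch {
  public static Configuration configure() {
    Configuration conf = new Configuration();
    // Key name taken from OUTPUT_TABLE_PARTITIONING_KEY above; the value is an assumed
    // JSON rendering of BigQuery's TimePartitioning ("event_time" is illustrative only).
    conf.set(
        "mapred.bq.output.table.partitioning",
        "{\"type\": \"DAY\", \"field\": \"event_time\"}");
    return conf;
  }
}
```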
@@ -13,6 +13,8 @@
*/
package com.google.cloud.hadoop.io.bigquery;

import static com.google.common.flogger.LazyArgs.lazy;

import com.google.api.services.bigquery.Bigquery;
import com.google.api.services.bigquery.Bigquery.Jobs.Insert;
import com.google.api.services.bigquery.model.Dataset;
@@ -26,6 +28,7 @@
import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
import com.google.cloud.hadoop.util.ApiErrorExtractor;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
@@ -141,19 +144,62 @@ public void importFromGcs(
List<String> gcsPaths,
boolean awaitCompletion)
throws IOException, InterruptedException {
importFromGcs(
projectId,
tableRef,
schema,
/* timePartitioning= */ null,
kmsKeyName,
sourceFormat,
writeDisposition,
gcsPaths,
awaitCompletion);
}

/**
* Imports data from GCS into BigQuery via a load job. Optionally polls for completion before
* returning.
*
* @param projectId the project on whose behalf to perform the load.
* @param tableRef the reference to the destination table.
* @param schema the schema of the source data to populate the destination table by.
* @param timePartitioning time partitioning to populate the destination table.
* @param kmsKeyName the Cloud KMS encryption key used to protect the output table.
* @param sourceFormat the file format of the source data.
* @param writeDisposition the write disposition of the output table.
* @param gcsPaths the location of the source data in GCS.
* @param awaitCompletion if true, block and poll until job completes, otherwise return as soon as
* the job has been successfully dispatched.
* @throws IOException
* @throws InterruptedException if interrupted while waiting for job completion.
*/
public void importFromGcs(
String projectId,
TableReference tableRef,
@Nullable TableSchema schema,
@Nullable TimePartitioning timePartitioning,
@Nullable String kmsKeyName,
BigQueryFileFormat sourceFormat,
String writeDisposition,
List<String> gcsPaths,
boolean awaitCompletion)
throws IOException, InterruptedException {
logger.atInfo().log(
"Importing into table '%s' from %s paths; path[0] is '%s'; awaitCompletion: %s",
BigQueryStrings.toString(tableRef),
"Importing into table '%s' from %s paths; path[0] is '%s'; awaitCompletion: %s;"
+ " timePartitioning: %s",
lazy(() -> BigQueryStrings.toString(tableRef)),
gcsPaths.size(),
gcsPaths.isEmpty() ? "(empty)" : gcsPaths.get(0),
awaitCompletion);
awaitCompletion,
timePartitioning);

// Create load conf with minimal requirements.
JobConfigurationLoad loadConfig = new JobConfigurationLoad();
loadConfig.setSchema(schema);
loadConfig.setSourceFormat(sourceFormat.getFormatIdentifier());
loadConfig.setSourceUris(gcsPaths);
loadConfig.setDestinationTable(tableRef);
loadConfig.setTimePartitioning(timePartitioning);
loadConfig.setWriteDisposition(writeDisposition);
if (!Strings.isNullOrEmpty(kmsKeyName)) {
loadConfig.setDestinationEncryptionConfiguration(
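A hedged usage sketch of the new overload follows. The project, dataset, table, and bucket names are placeholders, the `event_time` partition column is assumed, and the `BigQueryHelper`, `TableSchema`, and `BigQueryFileFormat` values are assumed to be set up elsewhere with an authorized `Bigquery` client.

```java
import com.google.api.services.bigquery.model.TableReference;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.api.services.bigquery.model.TimePartitioning;
import com.google.cloud.hadoop.io.bigquery.BigQueryFileFormat;
import com.google.cloud.hadoop.io.bigquery.BigQueryHelper;
import java.io.IOException;
import java.util.Collections;

class ImportFromGcsSketch {
  // `helper`, `schema`, and `sourceFormat` are assumed to be constructed elsewhere.
  static void loadDayPartitioned(
      BigQueryHelper helper, TableSchema schema, BigQueryFileFormat sourceFormat)
      throws IOException, InterruptedException {
    TableReference table =
        new TableReference()
            .setProjectId("my-project") // placeholder project
            .setDatasetId("my_dataset")
            .setTableId("events");
    // Daily partitioning on an assumed "event_time" column.
    TimePartitioning partitioning =
        new TimePartitioning().setType("DAY").setField("event_time");
    helper.importFromGcs(
        "my-project",
        table,
        schema,
        partitioning,
        /* kmsKeyName= */ null,
        sourceFormat,
        "WRITE_APPEND",
        Collections.singletonList("gs://my-bucket/export/part-*"), // placeholder path
        /* awaitCompletion= */ true);
  }
}
```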
@@ -210,7 +210,8 @@ public void commitTask(TaskAttemptContext context) throws IOException {
// Run the job.
logger.atFine().log(
"commitTask: Running table copy from %s to %s",
BigQueryStrings.toString(tempTableRef), BigQueryStrings.toString(finalTableRef));
lazy(() -> BigQueryStrings.toString(tempTableRef)),
lazy(() -> BigQueryStrings.toString(finalTableRef)));
Job response = bigQueryHelper.insertJobOrFetchDuplicate(projectId, job);
logger.atFine().log("Got response '%s'", response);

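The `lazy(...)` wrappers introduced here come from flogger's `LazyArgs`. A small standalone sketch of the pattern, where `expensiveToString` is a hypothetical stand-in for `BigQueryStrings.toString`:

```java
import static com.google.common.flogger.LazyArgs.lazy;

import com.google.common.flogger.GoogleLogger;

class LazyLogArgsSketch {
  private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();

  void logTableCopy(Object tempTableRef, Object finalTableRef) {
    // Each lazy(...) defers the string conversion until FINE logging is actually
    // enabled, so disabled log statements pay almost nothing.
    logger.atFine().log(
        "commitTask: Running table copy from %s to %s",
        lazy(() -> expensiveToString(tempTableRef)),
        lazy(() -> expensiveToString(finalTableRef)));
  }

  private static String expensiveToString(Object ref) {
    return String.valueOf(ref); // hypothetical stand-in for BigQueryStrings.toString(ref)
  }
}
```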
@@ -13,6 +13,8 @@
*/
package com.google.cloud.hadoop.io.bigquery;

import static com.google.common.base.Preconditions.checkArgument;

import com.google.api.client.util.BackOff;
import com.google.api.client.util.ExponentialBackOff;
import com.google.api.client.util.Sleeper;
@@ -23,7 +25,6 @@
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.cloud.hadoop.util.ResilientOperation;
import com.google.cloud.hadoop.util.RetryDeterminer;
import com.google.common.base.Preconditions;
import com.google.common.flogger.GoogleLogger;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
@@ -35,9 +36,7 @@
import java.util.concurrent.TimeUnit;
import org.apache.hadoop.util.Progressable;

/**
* Helper methods to interact with BigQuery.
*/
/** Helper methods to interact with BigQuery. */
public class BigQueryUtils {
private static final GoogleLogger logger = GoogleLogger.forEnclosingClass();

@@ -61,15 +60,11 @@ public class BigQueryUtils {
* @param projectId the project that is polling.
* @param jobReference the job to poll.
* @param progressable to get progress of task.
*
* @throws IOException on IO Error.
* @throws InterruptedException on sleep interrupt.
*/
public static void waitForJobCompletion(
Bigquery bigquery,
String projectId,
JobReference jobReference,
Progressable progressable)
Bigquery bigquery, String projectId, JobReference jobReference, Progressable progressable)
throws IOException, InterruptedException {

Sleeper sleeper = Sleeper.DEFAULT;
@@ -88,16 +83,19 @@ public static void waitForJobCompletion(
// While job is incomplete continue to poll.
while (notDone) {
BackOff operationBackOff = new ExponentialBackOff.Builder().build();
Get get = bigquery.jobs()
.get(projectId, jobReference.getJobId())
.setLocation(jobReference.getLocation());

Job pollJob = ResilientOperation.retry(
ResilientOperation.getGoogleRequestCallable(get),
operationBackOff,
RetryDeterminer.RATE_LIMIT_ERRORS,
IOException.class,
sleeper);
Get get =
bigquery
.jobs()
.get(projectId, jobReference.getJobId())
.setLocation(jobReference.getLocation());

Job pollJob =
ResilientOperation.retry(
ResilientOperation.getGoogleRequestCallable(get),
operationBackOff,
RetryDeterminer.RATE_LIMIT_ERRORS,
IOException.class,
sleeper);

elapsedTime = System.currentTimeMillis() - startTime;
logger.atFine().log(
@@ -106,16 +104,16 @@
if (pollJob.getStatus().getState().equals("DONE")) {
notDone = false;
if (pollJob.getStatus().getErrorResult() != null) {
throw new IOException(pollJob.getStatus().getErrorResult().getMessage());
throw new IOException(
"Error during BigQuery job execution: " + pollJob.getStatus().getErrorResult());
}
} else {
long millisToWait = pollBackOff.nextBackOffMillis();
if (millisToWait == BackOff.STOP) {
throw new IOException(
String.format(
"Job %s failed to complete after %s millis.",
jobReference.getJobId(),
elapsedTime));
jobReference.getJobId(), elapsedTime));
}
// Pause execution for the configured duration before polling job status again.
Thread.sleep(millisToWait);
@@ -137,18 +135,20 @@ public static List<TableFieldSchema> getSchemaFromString(String fields) {
// Parse the output schema for Json from fields.
JsonParser jsonParser = new JsonParser();
JsonArray json = jsonParser.parse(fields).getAsJsonArray();
List<TableFieldSchema> fieldsList = new ArrayList<>();
List<TableFieldSchema> fieldsList = new ArrayList<>();

// For each item in the list of fields.
for (JsonElement jsonElement : json) {
Preconditions.checkArgument(jsonElement.isJsonObject(),
"Expected JsonObject for element, got '%s'.", jsonElement);
checkArgument(
jsonElement.isJsonObject(), "Expected JsonObject for element, got '%s'.", jsonElement);
JsonObject jsonObject = jsonElement.getAsJsonObject();

// Set the name and type.
Preconditions.checkArgument(jsonObject.get("name") != null,
checkArgument(
jsonObject.get("name") != null,
"Expected non-null entry for key 'name' in JsonObject '%s'", jsonObject);
Preconditions.checkArgument(jsonObject.get("type") != null,
checkArgument(
jsonObject.get("type") != null,
"Expected non-null entry for key 'type' in JsonObject '%s'", jsonObject);
TableFieldSchema fieldDef = new TableFieldSchema();
fieldDef.setName(jsonObject.get("name").getAsString());
@@ -161,7 +161,7 @@ public static List<TableFieldSchema> getSchemaFromString(String fields) {

// If the type is RECORD set the fields.
if (jsonObject.get("type").getAsString().equals("RECORD")) {
Preconditions.checkArgument(
checkArgument(
jsonObject.get("fields") != null,
"Expected non-null entry for key 'fields' in JsonObject of type RECORD: '%s'",
jsonObject);
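As a hedged illustration of the JSON this parser accepts: the field names below are made up, but the structure follows the `checkArgument` validations above, where every element needs `name` and `type` and a `RECORD` element additionally needs `fields` (the nested-record case added in 0.13.17).

```java
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.cloud.hadoop.io.bigquery.BigQueryUtils;
import java.util.List;

class SchemaStringSketch {
  static List<TableFieldSchema> parseExample() {
    // Every element needs "name" and "type"; RECORD elements also need "fields".
    String fields =
        "[{\"name\": \"user\", \"type\": \"RECORD\", \"fields\": ["
            + "{\"name\": \"id\", \"type\": \"STRING\"},"
            + "{\"name\": \"visits\", \"type\": \"INTEGER\"}]},"
            + "{\"name\": \"event_time\", \"type\": \"TIMESTAMP\"}]";
    return BigQueryUtils.getSchemaFromString(fields);
  }
}
```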