[6.4.0] Fix handling of non-ASCII characters in archive entry file names (#19765)

When creating a `PathFragment` from a ZIP or TAR entry file name, the raw bytes of the name are now wrapped into a Latin-1 encoded String, which is how Bazel internally represents file paths. Previously, ZIP entries as well as TAR entries with PAX headers would result in ordinary decoded Java strings, resulting in corrupted file names when passed to Bazel's file system operations. Fixes #12986 Fixes bazelbuild/rules_go#2771 Closes #18448. PiperOrigin-RevId: 571857847 Change-Id: Ie578724e75ddbefbe05255601b0afab706835f89 Fixes #19671
bazelbuild · Oct 9, 2023 · b90ab8b · b90ab8b
1 parent ddc3e52
commit b90ab8b
Show file tree

Hide file tree

Showing 8 changed files with 301 additions and 60 deletions.
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD b/src/main/java/com/google/devtools/build/lib/bazel/repository/BUILD
@@ -49,6 +49,7 @@ java_library(
         "//src/main/java/com/google/devtools/common/options",
         "//src/main/java/net/starlark/java/eval",
         "//third_party:apache_commons_compress",
+        "//third_party:auto_service",
         "//third_party:auto_value",
         "//third_party:flogger",
         "//third_party:guava",

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/CompressedTarFunction.java
@@ -15,8 +15,10 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
 
-import com.google.common.base.Optional;
+import com.google.auto.service.AutoService;
 import com.google.common.io.ByteStreams;
 import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
 import com.google.devtools.build.lib.vfs.FileSystemUtils;
@@ -25,16 +27,33 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.nio.charset.CoderResult;
+import java.nio.charset.spi.CharsetProvider;
+import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.Map;
+import java.util.Optional;
 import java.util.Set;
+import java.util.UUID;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
 import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
 
 /**
  * Common code for unarchiving a compressed TAR file.
+ *
+ * <p>TAR file entries commonly use one of two formats: PAX, which uses UTF-8 encoding for all
+ * strings, and USTAR, which does not specify an encoding. This class interprets USTAR headers as
+ * latin-1, thus preserving the original bytes of the header without enforcing any particular
+ * encoding. Internally, for file system operations, all strings are converted into Bazel's internal
+ * representation of raw bytes stored as latin-1 strings.
  */
 public abstract class CompressedTarFunction implements Decompressor {
   protected abstract InputStream getDecompressorStream(DecompressorDescriptor descriptor)
@@ -54,20 +73,23 @@ public Path decompress(DecompressorDescriptor descriptor)
     Map<Path, PathFragment> symlinks = new HashMap<>();
 
     try (InputStream decompressorStream = getDecompressorStream(descriptor)) {
-      TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream);
+      // USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8.
+      // We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers
+      // is fixed to UTF-8. We thus specify a custom Charset for the former so that we can
+      // distinguish between the two.
+      TarArchiveInputStream tarStream =
+          new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME);
       TarArchiveEntry entry;
       while ((entry = tarStream.getNextTarEntry()) != null) {
-        String entryName = entry.getName();
+        String entryName = toRawBytesString(entry.getName());
         entryName = renameFiles.getOrDefault(entryName, entryName);
-        StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix);
+        StripPrefixedPath entryPath =
+            StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix);
         foundPrefix = foundPrefix || entryPath.foundPrefix();
 
         if (prefix.isPresent() && !foundPrefix) {
-          Optional<String> suggestion =
-              CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment());
-          if (suggestion.isPresent()) {
-            availablePrefixes.add(suggestion.get());
-          }
+          CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment())
+              .ifPresent(availablePrefixes::add);
         }
 
         if (entryPath.skip()) {
@@ -80,8 +102,11 @@ public Path decompress(DecompressorDescriptor descriptor)
           filePath.createDirectoryAndParents();
         } else {
           if (entry.isSymbolicLink() || entry.isLink()) {
-            PathFragment targetName = PathFragment.create(entry.getLinkName());
-            targetName = maybeDeprefixSymlink(targetName, prefix, descriptor.destinationPath());
+            PathFragment targetName =
+                maybeDeprefixSymlink(
+                    toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1),
+                    prefix,
+                    descriptor.destinationPath());
             if (entry.isSymbolicLink()) {
               symlinks.put(filePath, targetName);
             } else {
@@ -135,4 +160,100 @@ public Path decompress(DecompressorDescriptor descriptor)
 
     return descriptor.destinationPath();
   }
+
+  /**
+   * Converts an entry name into a string whose chars are the raw bytes of the name, decoded as
+   * ISO-8859-1. The given string is assumed to have been decoded with either UTF-8 or the special
+   * {@link MarkedIso88591Charset}.
+   */
+  private static String toRawBytesString(String name) {
+    Optional<String> unmarked = MarkedIso88591Charset.getRawBytesStringIfMarked(name);
+    if (unmarked.isPresent()) {
+      // Marked names were decoded as ISO-8859-1 already; only the marker had to be removed.
+      return unmarked.get();
+    }
+    // Unmarked names originate from PAX headers and were decoded as UTF-8: recover the raw bytes
+    // and reinterpret them as ISO-8859-1.
+    byte[] utf8Bytes = name.getBytes(UTF_8);
+    return new String(utf8Bytes, ISO_8859_1);
+  }
+
+  /** Service provider through which the {@link MarkedIso88591Charset} can be looked up by name. */
+  @AutoService(CharsetProvider.class)
+  public static class MarkedIso88591CharsetProvider extends CharsetProvider {
+    private static final Charset MARKED_CHARSET = new MarkedIso88591Charset();
+
+    @Override
+    public Iterator<Charset> charsets() {
+      // Deliberately enumerate nothing: the charset is an internal detail of
+      // CompressedTarFunction and should not be discoverable.
+      return Collections.emptyIterator();
+    }
+
+    @Override
+    public Charset charsetForName(String charsetName) {
+      if (MarkedIso88591Charset.NAME.equals(charsetName)) {
+        return MARKED_CHARSET;
+      }
+      // Any other name is not provided by this provider.
+      return null;
+    }
+  }
+
+  /**
+   * A charset that decodes ISO-8859-1, i.e., produces a String that contains the raw decoded bytes,
+   * and appends a marker to the end of the string to indicate that it was decoded with this
+   * charset.
+   *
+   * <p>This charset supports decoding only: {@link #newEncoder()} always throws and {@link
+   * #canEncode()} returns {@code false}.
+   */
+  private static final class MarkedIso88591Charset extends Charset {
+    // The name
+    // * must not collide with the name of any other charset.
+    // * must not appear in archive entry names by chance.
+    // * is internal to CompressedTarFunction.
+    // This is best served by a cryptographically random UUID, generated at startup.
+    private static final String NAME = UUID.randomUUID().toString();
+
+    private MarkedIso88591Charset() {
+      super(NAME, new String[0]);
+    }
+
+    /**
+     * Returns the given string with every occurrence of the marker removed if it contains the
+     * marker at least once, or an empty {@link Optional} if it does not contain the marker.
+     */
+    public static Optional<String> getRawBytesStringIfMarked(String s) {
+      // Check for the marker in all positions as TarArchiveInputStream manipulates the raw name in
+      // certain cases (for example, appending a '/' to directory names).
+      if (s.contains(NAME)) {
+        // The marker is a literal, not a regex: use replace() rather than replaceAll().
+        return Optional.of(s.replace(NAME, ""));
+      }
+      return Optional.empty();
+    }
+
+    @Override
+    public CharsetDecoder newDecoder() {
+      // Both the average and the maximum number of chars per byte are declared as 1; the marker
+      // is emitted separately when the decoder is flushed.
+      return new CharsetDecoder(this, 1, 1) {
+        @Override
+        protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
+          // A simple unoptimized ISO-8859-1 decoder.
+          while (in.hasRemaining()) {
+            if (!out.hasRemaining()) {
+              return CoderResult.OVERFLOW;
+            }
+            // Mask to map the signed byte onto the char range 0..255.
+            out.put((char) (in.get() & 0xFF));
+          }
+          return CoderResult.UNDERFLOW;
+        }
+
+        @Override
+        protected CoderResult implFlush(CharBuffer out) {
+          // Append the marker to the end of the buffer to indicate that it was decoded with this
+          // charset.
+          if (out.remaining() < NAME.length()) {
+            return CoderResult.OVERFLOW;
+          }
+          out.put(NAME);
+          return CoderResult.UNDERFLOW;
+        }
+      };
+    }
+
+    @Override
+    public CharsetEncoder newEncoder() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public boolean canEncode() {
+      // Encoding is not supported, see newEncoder().
+      return false;
+    }
+
+    @Override
+    public boolean contains(Charset cs) {
+      return false;
+    }
+  }
 }
diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorDescriptor.java
@@ -15,10 +15,10 @@
 package com.google.devtools.build.lib.bazel.repository;
 
 import com.google.auto.value.AutoValue;
-import com.google.common.base.Optional;
 import com.google.common.collect.ImmutableMap;
 import com.google.devtools.build.lib.vfs.Path;
 import java.util.Map;
+import java.util.Optional;
 
 /** Description of an archive to be decompressed. */
 @AutoValue

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/DecompressorValue.java
@@ -14,14 +14,17 @@
 
 package com.google.devtools.build.lib.bazel.repository;
 
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+import static java.nio.charset.StandardCharsets.UTF_8;
+
 import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Optional;
 import com.google.devtools.build.lib.rules.repository.RepositoryFunction.RepositoryFunctionException;
 import com.google.devtools.build.lib.vfs.Path;
 import com.google.devtools.build.lib.vfs.PathFragment;
 import com.google.devtools.build.skyframe.SkyFunctionException.Transience;
 import com.google.devtools.build.skyframe.SkyValue;
 import java.io.IOException;
+import java.util.Optional;
 import java.util.Set;
 import net.starlark.java.eval.Starlark;
 
@@ -59,9 +62,14 @@ private static String prepareErrorMessage(String prefix, Set<String> availablePr
       }
 
       public static Optional<String> maybeMakePrefixSuggestion(PathFragment pathFragment) {
-        return pathFragment.isMultiSegment()
-            ? Optional.of(pathFragment.getSegment(0))
-            : Optional.absent();
+        if (!pathFragment.isMultiSegment()) {
+          return Optional.empty();
+        }
+        String rawFirstSegment = pathFragment.getSegment(0);
+        // Users can only specify prefixes from Starlark, which is planned to use UTF-8 for all
+        // strings, but currently still collects the raw bytes in a latin-1 string. We thus
+        // optimistically decode the raw bytes with UTF-8 here for display purposes.
+        return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8));
       }
     }
 

diff --git a/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java b/src/main/java/com/google/devtools/build/lib/bazel/repository/StripPrefixedPath.java
@@ -14,11 +14,13 @@
 
 package com.google.devtools.build.lib.bazel.repository;
 
-import com.google.common.base.Optional;
+import static java.nio.charset.StandardCharsets.ISO_8859_1;
+
 import com.google.common.base.Preconditions;
 import com.google.devtools.build.lib.concurrent.ThreadSafety;
 import com.google.devtools.build.lib.vfs.Path;
 import com.google.devtools.build.lib.vfs.PathFragment;
+import java.util.Optional;
 
 /**
  * Utility class for removing a prefix from an archive's path.
@@ -36,17 +38,19 @@ public final class StripPrefixedPath {
    * could cause collisions, if a zip file had one entry for bin/some-binary and another entry for
    * /bin/some-binary.
    *
-   * Note that the prefix is stripped to move the files up one level, so if you have an entry
+   * <p>Note that the prefix is stripped to move the files up one level, so if you have an entry
    * "foo/../bar" and a prefix of "foo", the result will be "bar" not "../bar".
    */
-  public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> prefix) {
+  public static StripPrefixedPath maybeDeprefix(byte[] entry, Optional<String> prefix) {
     Preconditions.checkNotNull(entry);
     PathFragment entryPath = relativize(entry);
-    if (!prefix.isPresent()) {
+    if (prefix.isEmpty()) {
       return new StripPrefixedPath(entryPath, false, false);
     }
 
-    PathFragment prefixPath = relativize(prefix.get());
+    // Bazel parses Starlark files, which are the ultimate source of prefixes, as Latin-1
+    // (ISO-8859-1).
+    PathFragment prefixPath = relativize(prefix.get().getBytes(ISO_8859_1));
     boolean found = false;
     boolean skip = false;
     if (entryPath.startsWith(prefixPath)) {
@@ -64,8 +68,8 @@ public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> pre
   /**
    * Normalize the path and, if it is absolute, make it relative (e.g., /foo/bar becomes foo/bar).
    */
-  private static PathFragment relativize(String path) {
-    PathFragment entryPath = PathFragment.create(path);
+  private static PathFragment relativize(byte[] path) {
+    PathFragment entryPath = createPathFragment(path);
     if (entryPath.isAbsolute()) {
       entryPath = entryPath.toRelative();
     }
@@ -79,10 +83,10 @@ private StripPrefixedPath(PathFragment pathFragment, boolean found, boolean skip
   }
 
   public static PathFragment maybeDeprefixSymlink(
-      PathFragment linkPathFragment, Optional<String> prefix, Path root) {
-    boolean wasAbsolute = linkPathFragment.isAbsolute();
+      byte[] rawTarget, Optional<String> prefix, Path root) {
+    boolean wasAbsolute = createPathFragment(rawTarget).isAbsolute();
     // Strip the prefix from the link path if set.
-    linkPathFragment = maybeDeprefix(linkPathFragment.getPathString(), prefix).getPathFragment();
+    PathFragment linkPathFragment = maybeDeprefix(rawTarget, prefix).getPathFragment();
     if (wasAbsolute) {
       // Recover the path to an absolute path as maybeDeprefix() relativize the path
       // even if the prefix is not set
@@ -103,4 +107,10 @@ public boolean skip() {
     return skip;
   }
 
+  static PathFragment createPathFragment(byte[] rawBytes) {
+    // Bazel internally represents paths as raw bytes held in a Latin-1 string. Latin-1 round-trips
+    // every byte array: (new String(bytes, ISO_8859_1)).getBytes(ISO_8859_1) equals bytes.
+    String latin1Path = new String(rawBytes, ISO_8859_1);
+    return PathFragment.create(latin1Path);
+  }
 }