Skip to content

Commit

Permalink
[6.4.0] Fix handling of non-ASCII characters in archive entry file names (#19765)
Browse files Browse the repository at this point in the history

When creating a `PathFragment` from a ZIP or TAR entry file name, the
raw bytes of the name are now wrapped into a Latin-1 encoded String,
which is how Bazel internally represents file paths.

Previously, the names of ZIP entries as well as TAR entries with PAX
headers were decoded into ordinary Java strings, which led to corrupted
file names when they were passed to Bazel's file system operations.

Fixes #12986

Fixes bazelbuild/rules_go#2771

Closes #18448.

PiperOrigin-RevId: 571857847
Change-Id: Ie578724e75ddbefbe05255601b0afab706835f89

Fixes #19671
  • Loading branch information
fmeum committed Oct 9, 2023
1 parent ddc3e52 commit b90ab8b
Show file tree
Hide file tree
Showing 8 changed files with 301 additions and 60 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ java_library(
"//src/main/java/com/google/devtools/common/options",
"//src/main/java/net/starlark/java/eval",
"//third_party:apache_commons_compress",
"//third_party:auto_service",
"//third_party:auto_value",
"//third_party:flogger",
"//third_party:guava",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@
package com.google.devtools.build.lib.bazel.repository;

import static com.google.devtools.build.lib.bazel.repository.StripPrefixedPath.maybeDeprefixSymlink;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.base.Optional;
import com.google.auto.service.AutoService;
import com.google.common.io.ByteStreams;
import com.google.devtools.build.lib.bazel.repository.DecompressorValue.Decompressor;
import com.google.devtools.build.lib.vfs.FileSystemUtils;
Expand All @@ -25,16 +27,33 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.spi.CharsetProvider;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;

/**
* Common code for unarchiving a compressed TAR file.
*
* <p>TAR file entries commonly use one of two formats: PAX, which uses UTF-8 encoding for all
* strings, and USTAR, which does not specify an encoding. This class interprets USTAR headers as
* latin-1, thus preserving the original bytes of the header without enforcing any particular
* encoding. Internally, for file system operations, all strings are converted into Bazel's internal
* representation of raw bytes stored as latin-1 strings.
*/
public abstract class CompressedTarFunction implements Decompressor {
protected abstract InputStream getDecompressorStream(DecompressorDescriptor descriptor)
Expand All @@ -54,20 +73,23 @@ public Path decompress(DecompressorDescriptor descriptor)
Map<Path, PathFragment> symlinks = new HashMap<>();

try (InputStream decompressorStream = getDecompressorStream(descriptor)) {
TarArchiveInputStream tarStream = new TarArchiveInputStream(decompressorStream);
// USTAR tar headers use an unspecified encoding whereas PAX tar headers always use UTF-8.
// We can specify the encoding to use for USTAR headers, but the Charset used for PAX headers
// is fixed to UTF-8. We thus specify a custom Charset for the former so that we can
// distinguish between the two.
TarArchiveInputStream tarStream =
new TarArchiveInputStream(decompressorStream, MarkedIso88591Charset.NAME);
TarArchiveEntry entry;
while ((entry = tarStream.getNextTarEntry()) != null) {
String entryName = entry.getName();
String entryName = toRawBytesString(entry.getName());
entryName = renameFiles.getOrDefault(entryName, entryName);
StripPrefixedPath entryPath = StripPrefixedPath.maybeDeprefix(entryName, prefix);
StripPrefixedPath entryPath =
StripPrefixedPath.maybeDeprefix(entryName.getBytes(ISO_8859_1), prefix);
foundPrefix = foundPrefix || entryPath.foundPrefix();

if (prefix.isPresent() && !foundPrefix) {
Optional<String> suggestion =
CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment());
if (suggestion.isPresent()) {
availablePrefixes.add(suggestion.get());
}
CouldNotFindPrefixException.maybeMakePrefixSuggestion(entryPath.getPathFragment())
.ifPresent(availablePrefixes::add);
}

if (entryPath.skip()) {
Expand All @@ -80,8 +102,11 @@ public Path decompress(DecompressorDescriptor descriptor)
filePath.createDirectoryAndParents();
} else {
if (entry.isSymbolicLink() || entry.isLink()) {
PathFragment targetName = PathFragment.create(entry.getLinkName());
targetName = maybeDeprefixSymlink(targetName, prefix, descriptor.destinationPath());
PathFragment targetName =
maybeDeprefixSymlink(
toRawBytesString(entry.getLinkName()).getBytes(ISO_8859_1),
prefix,
descriptor.destinationPath());
if (entry.isSymbolicLink()) {
symlinks.put(filePath, targetName);
} else {
Expand Down Expand Up @@ -135,4 +160,100 @@ public Path decompress(DecompressorDescriptor descriptor)

return descriptor.destinationPath();
}

/**
 * Converts an archive entry name into Bazel's raw-bytes representation, i.e., a string whose
 * chars are the bytes of the name when read as ISO-8859-1.
 *
 * <p>The given string is assumed to have been decoded with either UTF-8 or the special {@link
 * MarkedIso88591Charset}.
 */
private static String toRawBytesString(String name) {
  // A marked string was decoded by MarkedIso88591Charset and thus already holds the raw bytes;
  // only the marker has to be removed.
  Optional<String> unmarked = MarkedIso88591Charset.getRawBytesStringIfMarked(name);
  if (unmarked.isPresent()) {
    return unmarked.get();
  }
  // An unmarked string originates from a PAX header and was therefore decoded as UTF-8: recover
  // the raw bytes and reinterpret them trivially as ISO-8859-1.
  byte[] utf8Bytes = name.getBytes(UTF_8);
  return new String(utf8Bytes, ISO_8859_1);
}

/**
 * A {@link CharsetProvider} that resolves the name of {@link MarkedIso88591Charset} to its
 * single shared instance and exposes no other charsets.
 */
@AutoService(CharsetProvider.class)
public static class MarkedIso88591CharsetProvider extends CharsetProvider {
  private static final Charset MARKED_CHARSET = new MarkedIso88591Charset();

  /** Returns an empty iterator so that the charset is never listed. */
  @Override
  public Iterator<Charset> charsets() {
    // The charset exists solely for CompressedTarFunction's internal use, so it must not be
    // discoverable by enumeration.
    return Collections.emptyIterator();
  }

  /**
   * Returns the marked charset if {@code charsetName} is exactly its name, and {@code null} for
   * any other name.
   */
  @Override
  public Charset charsetForName(String charsetName) {
    if (MarkedIso88591Charset.NAME.equals(charsetName)) {
      return MARKED_CHARSET;
    }
    return null;
  }
}

/**
 * A charset that decodes ISO-8859-1, i.e., produces a String that contains the raw decoded bytes,
 * and appends a marker to the end of the string to indicate that it was decoded with this
 * charset.
 *
 * <p>The marker is the charset's own {@link #NAME}. This charset only supports decoding.
 */
private static class MarkedIso88591Charset extends Charset {
  // The name
  //  * must not collide with the name of any other charset.
  //  * must not appear in archive entry names by chance.
  //  * is internal to CompressedTarFunction.
  // This is best served by a cryptographically random UUID, generated at startup.
  private static final String NAME = UUID.randomUUID().toString();

  private MarkedIso88591Charset() {
    super(NAME, new String[0]);
  }

  /**
   * Removes the marker from a string that was decoded with this charset.
   *
   * @param s the string to inspect
   * @return {@code s} with every occurrence of the marker removed if it contains the marker at
   *     least once, or an empty {@link Optional} if it contains no marker
   */
  public static Optional<String> getRawBytesStringIfMarked(String s) {
    // Check for the marker in all positions as TarArchiveInputStream manipulates the raw name in
    // certain cases (for example, appending a '/' to directory names).
    if (!s.contains(NAME)) {
      return Optional.empty();
    }
    // The marker is a literal, not a pattern: String.replace avoids interpreting it as a regular
    // expression (and compiling that expression on every call), unlike String.replaceAll.
    return Optional.of(s.replace(NAME, ""));
  }

  /**
   * Returns a decoder that maps every input byte to the char with the same value and emits the
   * marker when flushed.
   */
  @Override
  public CharsetDecoder newDecoder() {
    return new CharsetDecoder(this, 1, 1) {
      @Override
      protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
        // A simple unoptimized ISO-8859-1 decoder.
        while (in.hasRemaining()) {
          if (!out.hasRemaining()) {
            return CoderResult.OVERFLOW;
          }
          // Mask to treat the byte as unsigned, so that 0x80-0xFF map to U+0080-U+00FF.
          out.put((char) (in.get() & 0xFF));
        }
        return CoderResult.UNDERFLOW;
      }

      @Override
      protected CoderResult implFlush(CharBuffer out) {
        // Append the marker to the end of the buffer to indicate that it was decoded with this
        // charset. Nothing is written unless the whole marker fits.
        if (out.remaining() < NAME.length()) {
          return CoderResult.OVERFLOW;
        }
        out.put(NAME);
        return CoderResult.UNDERFLOW;
      }
    };
  }

  /** Always returns {@code false}: this charset cannot encode, see {@link #newEncoder()}. */
  @Override
  public boolean canEncode() {
    return false;
  }

  /**
   * Not supported, this charset is only used for decoding.
   *
   * @throws UnsupportedOperationException always
   */
  @Override
  public CharsetEncoder newEncoder() {
    throw new UnsupportedOperationException();
  }

  /** Always returns {@code false}: this charset does not claim to contain any charset. */
  @Override
  public boolean contains(Charset cs) {
    return false;
  }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
package com.google.devtools.build.lib.bazel.repository;

import com.google.auto.value.AutoValue;
import com.google.common.base.Optional;
import com.google.common.collect.ImmutableMap;
import com.google.devtools.build.lib.vfs.Path;
import java.util.Map;
import java.util.Optional;

/** Description of an archive to be decompressed. */
@AutoValue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,17 @@

package com.google.devtools.build.lib.bazel.repository;

import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.UTF_8;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Optional;
import com.google.devtools.build.lib.rules.repository.RepositoryFunction.RepositoryFunctionException;
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import com.google.devtools.build.skyframe.SkyFunctionException.Transience;
import com.google.devtools.build.skyframe.SkyValue;
import java.io.IOException;
import java.util.Optional;
import java.util.Set;
import net.starlark.java.eval.Starlark;

Expand Down Expand Up @@ -59,9 +62,14 @@ private static String prepareErrorMessage(String prefix, Set<String> availablePr
}

public static Optional<String> maybeMakePrefixSuggestion(PathFragment pathFragment) {
return pathFragment.isMultiSegment()
? Optional.of(pathFragment.getSegment(0))
: Optional.absent();
if (!pathFragment.isMultiSegment()) {
return Optional.empty();
}
String rawFirstSegment = pathFragment.getSegment(0);
// Users can only specify prefixes from Starlark, which is planned to use UTF-8 for all
// strings, but currently still collects the raw bytes in a latin-1 string. We thus
// optimistically decode the raw bytes with UTF-8 here for display purposes.
return Optional.of(new String(rawFirstSegment.getBytes(ISO_8859_1), UTF_8));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@

package com.google.devtools.build.lib.bazel.repository;

import com.google.common.base.Optional;
import static java.nio.charset.StandardCharsets.ISO_8859_1;

import com.google.common.base.Preconditions;
import com.google.devtools.build.lib.concurrent.ThreadSafety;
import com.google.devtools.build.lib.vfs.Path;
import com.google.devtools.build.lib.vfs.PathFragment;
import java.util.Optional;

/**
* Utility class for removing a prefix from an archive's path.
Expand All @@ -36,17 +38,19 @@ public final class StripPrefixedPath {
* could cause collisions, if a zip file had one entry for bin/some-binary and another entry for
* /bin/some-binary.
*
* Note that the prefix is stripped to move the files up one level, so if you have an entry
* <p>Note that the prefix is stripped to move the files up one level, so if you have an entry
* "foo/../bar" and a prefix of "foo", the result will be "bar" not "../bar".
*/
public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> prefix) {
public static StripPrefixedPath maybeDeprefix(byte[] entry, Optional<String> prefix) {
Preconditions.checkNotNull(entry);
PathFragment entryPath = relativize(entry);
if (!prefix.isPresent()) {
if (prefix.isEmpty()) {
return new StripPrefixedPath(entryPath, false, false);
}

PathFragment prefixPath = relativize(prefix.get());
// Bazel parses Starlark files, which are the ultimate source of prefixes, as Latin-1
// (ISO-8859-1).
PathFragment prefixPath = relativize(prefix.get().getBytes(ISO_8859_1));
boolean found = false;
boolean skip = false;
if (entryPath.startsWith(prefixPath)) {
Expand All @@ -64,8 +68,8 @@ public static StripPrefixedPath maybeDeprefix(String entry, Optional<String> pre
/**
* Normalize the path and, if it is absolute, make it relative (e.g., /foo/bar becomes foo/bar).
*/
private static PathFragment relativize(String path) {
PathFragment entryPath = PathFragment.create(path);
private static PathFragment relativize(byte[] path) {
PathFragment entryPath = createPathFragment(path);
if (entryPath.isAbsolute()) {
entryPath = entryPath.toRelative();
}
Expand All @@ -79,10 +83,10 @@ private StripPrefixedPath(PathFragment pathFragment, boolean found, boolean skip
}

public static PathFragment maybeDeprefixSymlink(
PathFragment linkPathFragment, Optional<String> prefix, Path root) {
boolean wasAbsolute = linkPathFragment.isAbsolute();
byte[] rawTarget, Optional<String> prefix, Path root) {
boolean wasAbsolute = createPathFragment(rawTarget).isAbsolute();
// Strip the prefix from the link path if set.
linkPathFragment = maybeDeprefix(linkPathFragment.getPathString(), prefix).getPathFragment();
PathFragment linkPathFragment = maybeDeprefix(rawTarget, prefix).getPathFragment();
if (wasAbsolute) {
// Restore the absolute path, since maybeDeprefix() relativizes the path
// even if the prefix is not set
Expand All @@ -103,4 +107,10 @@ public boolean skip() {
return skip;
}

/**
 * Creates a {@link PathFragment} from the raw bytes of a path.
 *
 * <p>Bazel internally represents paths as raw bytes by using the Latin-1 encoding, which has the
 * property that {@code new String(bytes, ISO_8859_1).getBytes(ISO_8859_1)} equals {@code bytes}
 * for every byte array {@code bytes}.
 */
static PathFragment createPathFragment(byte[] rawBytes) {
  String latin1Path = new String(rawBytes, ISO_8859_1);
  return PathFragment.create(latin1Path);
}
}
Loading

0 comments on commit b90ab8b

Please sign in to comment.