Skip to content

Commit

Permalink
core: Add AdditionalCharsets with BOM supports, resolve #67
Browse files Browse the repository at this point in the history
  • Loading branch information
TheElectronWill committed Feb 26, 2024
1 parent 4aa7f95 commit 68502b9
Show file tree
Hide file tree
Showing 4 changed files with 290 additions and 22 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
package com.electronwill.nightconfig.core.io;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

/** Additional charsets to help parse configurations in an unknown encoding. */
public final class AdditionalCharsets {
private AdditionalCharsets() {
assert false;
}

/**
* UTF-8 charset with an optional Byte-Order Mark at the beginning.
*
* @see StandardCharsets.UTF_8
*/
public static final Charset UTF_8_BOM = new CharsetUnicodeBom(true);

/**
* A charset that decodes UTF-8 (with or without BOM) or UTF-16 (with BOM), and encodes UTF-8.
*
* @see StandardCharsets.UTF_8
* @see StandardCharsets.UTF_16
*/
public static final Charset UTF_8_OR_16 = new CharsetUnicodeBom(false);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
package com.electronwill.nightconfig.core.io;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.StandardCharsets;

/**
* A special charset that supports reading UTF-8 with an optional BOM, and UTF-16 with a BOM.
* Encoding is always done with UTF-8, without BOM.
*/
class CharsetUnicodeBom extends Charset {
private boolean utf8Only;

protected CharsetUnicodeBom(boolean utf8Only) {
super(utf8Only ? "UTF-8" : "UTF-8-autodetect", new String[] { "UTF-8" });
}

@Override
public boolean canEncode() {
return true;
}

@Override
public boolean contains(Charset cs) {
return StandardCharsets.UTF_8.contains(cs);
}

@Override
public CharsetDecoder newDecoder() {
return new Decoder(this);
}

@Override
public CharsetEncoder newEncoder() {
return new Encoder(this);
}

private static final class Decoder extends CharsetDecoder {
private boolean utf8Only;
private CharsetDecoder decoder = null;

Decoder(CharsetUnicodeBom cs) {
super(cs, 0.5f, 1.0f);
this.utf8Only = cs.utf8Only;
}

private void setupDecoder(Charset detectedCharset) {
this.decoder = detectedCharset.newDecoder()
.onMalformedInput(this.malformedInputAction())
.onUnmappableCharacter(this.unmappableCharacterAction())
.replaceWith(this.replacement());
}

@Override
public boolean isAutoDetecting() {
return true;
}

@Override
public boolean isCharsetDetected() {
return decoder != null;
}

@Override
public Charset detectedCharset() {
return decoder.charset();
}

@Override
protected CoderResult implFlush(CharBuffer out) {
return decoder.flush(out);
}

@Override
protected void implReset() {
decoder.reset();
}

@Override
protected CoderResult decodeLoop(ByteBuffer in, CharBuffer out) {
// Byte-Order Mark detection
if (decoder == null) {
int newPosition = in.position();
try {
if (in.remaining() >= 2) {
int b1 = in.get() & 0xff;
int b2 = in.get() & 0xff;
// detect UTF-16 BE and LE BOMs: wrong encoding!
if (b1 == 0xFE && b2 == 0xFF) {
if (utf8Only) {
throw new ParsingException(
"Invalid input: it begins with an UTF-16 BE byte-order mark, but it should be plain UTF-8.");
}
setupDecoder(StandardCharsets.UTF_16BE);
newPosition += 2;
} else if (b1 == 0xFF && b2 == 0xFE) {
if (utf8Only) {
throw new ParsingException(
"Invalid input: it begins with an UTF-16 LE byte-order mark, but it should be plain UTF-8.");
}
setupDecoder(StandardCharsets.UTF_16LE);
newPosition += 2;
} else if (b1 == 0xEF && b2 == 0xBB) {
if (in.hasRemaining()) {
int b3 = in.get() & 0xff;
if (b3 == 0xBF) {
// UTF-8 BOM "EF BB BF" detected! skip it
newPosition += 3;
}
setupDecoder(StandardCharsets.UTF_8);
} else {
return CoderResult.UNDERFLOW;
}
} else {
setupDecoder(StandardCharsets.UTF_8);
}
} else {
return CoderResult.UNDERFLOW;
}
} finally {
in.position(newPosition);
}
}
// normal decoding
return decoder.decode(in, out, false);
}
}

private static final class Encoder extends CharsetEncoder {
private final CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder();

Encoder(CharsetUnicodeBom cs) {
super(cs, 1.1f, 3.0f);
}

@Override
public boolean canEncode(char c) {
return encoder.canEncode(c);
}

@Override
public boolean canEncode(CharSequence cs) {
return encoder.canEncode(cs);
}

@Override
public boolean isLegalReplacement(byte[] repl) {
return encoder.isLegalReplacement(repl);
}

@Override
protected void implReset() {
encoder.reset();
}

@Override
protected CoderResult implFlush(ByteBuffer out) {
return encoder.flush(out);
}

@Override
protected CoderResult encodeLoop(CharBuffer in, ByteBuffer out) {
return encoder.encode(in, out, false);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,7 @@ static String parseKey(CharacterInput input, char firstChar, TomlParser parser)
throw new ParsingException("Empty bare keys aren't allowed.");
}
if (!Toml.isValidBareKey(bareKey, parser.isLenientWithBareKeys())) {
throw new ParsingException("Invalid bare key: " + bareKey);
throw new ParsingException("Invalid bare key: \'" + bareKey + "\'");
}
return bareKey;
}
Expand Down
Loading

0 comments on commit 68502b9

Please sign in to comment.