Bump version to 2.2.3, update Readme and Changelog
Change-Id: Ic4928596d72ce3f738a47f112d8064dc63324f56
kupietz committed Sep 6, 2023
1 parent ced7882 commit c2f448c
Showing 4 changed files with 21 additions and 9 deletions.
9 changes: 9 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,14 @@
# Changelog

## 2.2.3

* Updated dependencies
* Minimum Java version raised to 17
* Fixed group id in pom.xml
* Removed compile dependency on Maven Surefire
* Build artifacts in src/main/jflex are now ignored by git
* java.io's ByteArrayOutputStream used instead of 3rd-party class

## 2.2.2

* Bug fix: a single quotation mark at the beginning of a word
17 changes: 10 additions & 7 deletions Readme.md
@@ -14,8 +14,8 @@
The de-variant is used for the German Reference Corpus DeReKo. Being based on finite state automata,
the tokenizers are potentially not as accurate as language model based ones, but with ~5 billion words per hour typically more efficient.
An important feature in the DeReKo/KorAP context is also that token character offsets can be reported, which can be used for applying standoff annotations.
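Reported offsets make standoff annotation straightforward: an annotation stores only character indices into the unchanged source text, and the surface form is recovered by slicing. A minimal, self-contained Java sketch of the idea — the `Span` record, labels, and hard-coded offsets here are illustrative, not output of KorAP-Tokenizer itself:

```java
import java.util.List;

public class StandoffDemo {
    // A standoff annotation references the source text only by (start, end) offsets.
    record Span(int start, int end, String label) {}

    public static void main(String[] args) {
        String text = "It's working.";
        // Offsets as a tokenizer could report them for this input.
        List<Span> tokens = List.of(
                new Span(0, 2, "token"),   // "It"
                new Span(2, 4, "token"),   // "'s"
                new Span(5, 12, "token"),  // "working"
                new Span(12, 13, "token")  // "."
        );
        for (Span s : tokens) {
            // The annotated surface form is a slice of the original text.
            System.out.println(s.start() + "\t" + s.end() + "\t"
                    + text.substring(s.start(), s.end()));
        }
    }
}
```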

- The include mplementations of the `KorapTokenizer` interface also implement the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
- and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
+ The included implementations of the `KorapTokenizer` interface also implement the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/2.3.0/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
+ and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/2.3.0/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
interfaces and can thus be used as drop-in replacements in OpenNLP applications.

The underlying scanner is based on the Lucene scanner with modifications from [David Hall](https://github.com/dlwh).
@@ -38,7 +38,7 @@ By default, KorAP tokenizer reads from standard input and writes to standard output.

#### Split English text into tokens
```
- $ echo "It's working." | java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar -l en
+ $ echo "It's working." | java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar -l en
It
's
working
```

@@ -47,7 +47,7 @@
#### Split French text into tokens and sentences
```
$ echo "C'est une phrase. Ici, il s'agit d'une deuxième phrase." \
-   | java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar -s -l fr
+   | java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar -s -l fr
C'
est
une
```

@@ -72,7 +72,7 @@ With the `--positions` option, for example, the tokenizer prints all offsets of
In order to end a text, flush the output and reset the character position, an EOT character (0x04) can be used.
```
$ echo -n -e 'This is a text.\x0a\x04\x0aAnd this is another text.\n\x04\n' |\
-   java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar --positions
+   java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar --positions
This
is
a
```

@@ -90,7 +90,7 @@ text
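The EOT convention shown above is easy to reproduce on the client side: multiple texts are simply joined with 0x04, and each EOT ends one text and resets the character position. A stdlib-only Java sketch of how a client might split such a stream back into texts (the class and variable names are illustrative):

```java
public class EotSplitDemo {
    public static void main(String[] args) {
        // Two texts separated by EOT (0x04) characters, as in the example above.
        String stream = "This is a text.\n\u0004\nAnd this is another text.\n\u0004\n";
        // Each EOT marks the end of one text; character offsets
        // would restart at 0 for the following text.
        for (String part : stream.split("\u0004")) {
            String text = part.trim();
            if (!text.isEmpty()) {
                System.out.println("[" + text + "]");
            }
        }
    }
}
```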
#### Print token and sentence offset
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x04\x0aAnd this is another text.' |\
-   java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar --no-tokens --positions --sentence-boundaries
+   java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar --no-tokens --positions --sentence-boundaries
1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
1 28 29 54 55 76
0 3 4 8 9 11 12 19 20 24 24 25
```

@@ -111,7 +111,10 @@ Alternatively, you can also provide `KorAPTokenizer` implementations independent
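The `--positions` output above is a flat list of alternating start and end offsets, so consuming it amounts to reading the numbers pairwise and slicing the input text. A self-contained Java sketch under that assumption, using the first few offsets from the example (the class name is illustrative):

```java
import java.util.Scanner;

public class PositionsDemo {
    public static void main(String[] args) {
        // Input text from the example above (note the leading space).
        String text = " This ist a start of a text. And this is a sentence!!! "
                + "But what the hack????";
        // First few numbers from the --positions output line.
        String positions = "1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28";
        Scanner sc = new Scanner(positions);
        while (sc.hasNextInt()) {
            int start = sc.nextInt();
            int end = sc.nextInt(); // offsets come in (start, end) pairs
            System.out.println(text.substring(start, end));
        }
    }
}
```

Applied to the example, the pairs map back to the tokens `This`, `ist`, `a`, `start`, `of`, `a`, `text`, `.`.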
* [Marc Kupietz](https://www.ids-mannheim.de/digspra/personal/kupietz.html)
* [Nils Diewald](https://www.ids-mannheim.de/digspra/personal/diewald.html)

- Copyright (c) 2021, [Leibniz Institute for the German Language](http://www.ids-mannheim.de/), Mannheim, Germany
+ **Contributor**:
+ * [Gregor Middell](https://github.com/gremid)
+
+ Copyright (c) 2023, [Leibniz Institute for the German Language](http://www.ids-mannheim.de/), Mannheim, Germany

This package is developed as part of the [KorAP](http://korap.ids-mannheim.de/)
Corpus Analysis Platform at the Leibniz Institute for German Language
2 changes: 1 addition & 1 deletion pom.xml
@@ -6,7 +6,7 @@

<groupId>de.ids_mannheim.korap.tokenizer</groupId>
<artifactId>KorAP-Tokenizer</artifactId>
- <version>2.2.2</version>
+ <version>2.2.3</version>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
2 changes: 1 addition & 1 deletion src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -14,7 +14,7 @@
import java.util.stream.Collectors;

@CommandLine.Command(mixinStandardHelpOptions = true,
-     name = "koraptokenizer", version = "2.2.1", description = "Tokenizes (and sentence splits) text input.")
+     name = "koraptokenizer", version = "2.2.3", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {

public final String DEFAULT_LANGUAGE = "de";
