diff --git a/pdf-itext-samples/build.gradle b/pdf-itext-samples/build.gradle index c117bf4..2b7b695 100644 --- a/pdf-itext-samples/build.gradle +++ b/pdf-itext-samples/build.gradle @@ -10,7 +10,9 @@ sourceCompatibility = JavaVersion.VERSION_17 targetCompatibility = JavaVersion.VERSION_17 dependencies { - api("com.itextpdf:itextpdf:5.5.13.3") +// api("com.itextpdf:itextpdf:5.5.13.3") + api("com.itextpdf:itext-core:8.0.3") + api("org.apache.commons:commons-lang3:3.14.0") } diff --git a/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractor.java b/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractor.java index d222b38..0eddb13 100644 --- a/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractor.java +++ b/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractor.java @@ -40,13 +40,14 @@ private static boolean sentenceIsValidToBeConsidered( StringBuilder buff, int minWordLength, char c ) { return metMinLength(buff, minWordLength) && (!CharUtils.isAsciiAlpha(c) - || Character.isWhitespace(buff.charAt(buff.length() - 1))); + || Character.isWhitespace(buff.charAt(buff.length() - 1))); } private static boolean isNonTextAfterTheStartOfTheSentence(StringBuilder buff, char c) { - return ( - CharUtils.isAsciiNumeric(c) || Character.isWhitespace(c) - || c == '-' || c == '_' - ) && !buff.isEmpty(); + return !(c == '\r' || c == '\n') + && ( + CharUtils.isAsciiNumeric(c) || Character.isWhitespace(c) + || c == '-' || c == '_' + ) && !buff.isEmpty(); } } diff --git a/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UppercasePhraseExtractorApp.java b/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UppercasePhraseExtractorApp.java index 941def0..179d507 100644 --- a/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UppercasePhraseExtractorApp.java +++ 
b/pdf-itext-samples/src/main/java/com/mageddo/itext/pdftextextraction/UppercasePhraseExtractorApp.java @@ -4,9 +4,10 @@ import java.util.ArrayList; import java.util.List; -import com.itextpdf.text.pdf.PdfReader; -import com.itextpdf.text.pdf.parser.PdfReaderContentParser; -import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfReader; +import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor; +import com.itextpdf.kernel.pdf.canvas.parser.listener.SimpleTextExtractionStrategy; import static com.mageddo.itext.pdftextextraction.UpperCaseWordsExtractor.extractUppercaseWordsSentenceFromText; @@ -19,31 +20,32 @@ public static void main(String[] args) throws IOException { } private static void extractUppercaseWordSequencesFromPdf(final String fileName) throws IOException { - final var reader = new PdfReader(fileName); - final var parser = new PdfReaderContentParser(reader); - final var strategy = new SimpleTextExtractionStrategy(); + + final var document = new PdfDocument(new PdfReader(fileName)); try { - final var sentences = extractUppercaseWordSequencesFromPdf(reader, parser, strategy); + final var sentences = extractUppercaseWordSequencesFromPdf(document); System.out.println(sentences); } finally { - reader.close(); + document.close(); } } private static List extractUppercaseWordSequencesFromPdf( - PdfReader reader, - PdfReaderContentParser parser, - SimpleTextExtractionStrategy strategy - ) throws IOException { + PdfDocument document + ) { final var sentences = new ArrayList(); - for (int i = 1; i <= reader.getNumberOfPages(); i++) { - if(i == 445) { - parser.processContent(i, strategy); -// sentences.addAll(extractUppercaseWordsSentenceFromText(strategy.getResultantText())); - final var s = extractUppercaseWordsSentenceFromText(strategy.getResultantText(), 2); - System.out.println(s); - System.out.println("page: " + i); - } + for (int i = 1; i <= 
document.getNumberOfPages(); i++) { + // create the strategy inside the loop iteration; otherwise it accumulates the words of every page + // already processed in the returned text + final var strategy = new SimpleTextExtractionStrategy(); + final var text = PdfTextExtractor.getTextFromPage(document.getPage(i), strategy); +// sentences.addAll(extractUppercaseWordsSentenceFromText(text)); + final var s = extractUppercaseWordsSentenceFromText(text); + System.out.println("--------------------------------------"); + System.out.println(s); + System.out.println("page: " + i); + System.out.println("====================================="); + } return sentences; } diff --git a/pdf-itext-samples/src/test/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractorTest.java b/pdf-itext-samples/src/test/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractorTest.java index 6ed12a0..70f9f6a 100644 --- a/pdf-itext-samples/src/test/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractorTest.java +++ b/pdf-itext-samples/src/test/java/com/mageddo/itext/pdftextextraction/UpperCaseWordsExtractorTest.java @@ -46,4 +46,23 @@ void mustIgnoreWordsShorterThanMinSpecifiedLength(){ } + @Test + void mustIgnoreWordsSeparatedByNewLines(){ + + // arrange + final var str = """ + The SUN + IS yellow. + """; + + // act + final var words = extractUppercaseWordsSentenceFromText(str, 2); + + // assert + assertNotNull(words); + assertEquals(""" + [SUN, IS]""", words.toString()); + + } + }