Skip to content

Commit

Permalink
new test case
Browse files Browse the repository at this point in the history
  • Loading branch information
mageddo committed Apr 1, 2024
1 parent cab0b84 commit 9103476
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 26 deletions.
4 changes: 3 additions & 1 deletion pdf-itext-samples/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ sourceCompatibility = JavaVersion.VERSION_17
targetCompatibility = JavaVersion.VERSION_17

dependencies {
  // iText 8 (itext-core) supersedes the former iText 5 artifact com.itextpdf:itextpdf:5.5.13.3
  api("com.itextpdf:itext-core:8.0.3")

  api("org.apache.commons:commons-lang3:3.14.0")

}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,14 @@ private static boolean sentenceIsValidToBeConsidered(
StringBuilder buff, int minWordLength, char c
) {
return metMinLength(buff, minWordLength) && (!CharUtils.isAsciiAlpha(c)
|| Character.isWhitespace(buff.charAt(buff.length() - 1)));
|| Character.isWhitespace(buff.charAt(buff.length() - 1)));
}

/**
 * Tells whether {@code c} is a non-letter character that is still accepted once a
 * sentence has already started: an ASCII digit, whitespace other than a line break,
 * {@code '-'} or {@code '_'}.
 *
 * @param buff characters accumulated so far for the current sentence
 * @param c    the character being examined
 * @return {@code true} when {@code buff} is not empty and {@code c} is one of the accepted
 *         characters; {@code '\r'} and {@code '\n'} always yield {@code false}
 */
private static boolean isNonTextAfterTheStartOfTheSentence(StringBuilder buff, char c) {
  // Line breaks count as whitespace, so they must be rejected before the whitespace check.
  if (c == '\r' || c == '\n') {
    return false;
  }
  final boolean accepted = CharUtils.isAsciiNumeric(c) || Character.isWhitespace(c)
      || c == '-' || c == '_';
  return accepted && !buff.isEmpty();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@
import java.util.ArrayList;
import java.util.List;

import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.canvas.parser.PdfTextExtractor;
import com.itextpdf.kernel.pdf.canvas.parser.listener.SimpleTextExtractionStrategy;

import static com.mageddo.itext.pdftextextraction.UpperCaseWordsExtractor.extractUppercaseWordsSentenceFromText;

Expand All @@ -19,31 +20,32 @@ public static void main(String[] args) throws IOException {
}

/**
 * Opens the PDF at {@code fileName}, extracts the uppercase word sequences of its pages
 * and prints the resulting list to standard output.
 *
 * @param fileName path of the PDF file to read
 * @throws IOException if the PDF reader cannot be created for {@code fileName}
 */
private static void extractUppercaseWordSequencesFromPdf(final String fileName) throws IOException {
  // try-with-resources closes the document even when the extraction throws
  try (var document = new PdfDocument(new PdfReader(fileName))) {
    final var sentences = extractUppercaseWordSequencesFromPdf(document);
    System.out.println(sentences);
  }
}

private static List<String> extractUppercaseWordSequencesFromPdf(
PdfReader reader,
PdfReaderContentParser parser,
SimpleTextExtractionStrategy strategy
) throws IOException {
PdfDocument document
) {
final var sentences = new ArrayList<String>();
for (int i = 1; i <= reader.getNumberOfPages(); i++) {
if(i == 445) {
parser.processContent(i, strategy);
// sentences.addAll(extractUppercaseWordsSentenceFromText(strategy.getResultantText()));
final var s = extractUppercaseWordsSentenceFromText(strategy.getResultantText(), 2);
System.out.println(s);
System.out.println("page: " + i);
}
for (int i = 1; i <= document.getNumberOfPages(); i++) {
// se nao criar o strategy dentro da interação ele acumula as palavras de todas as páginas
// executadas no retorno do texto
final var strategy = new SimpleTextExtractionStrategy();
final var text = PdfTextExtractor.getTextFromPage(document.getPage(i), strategy);
// sentences.addAll(extractUppercaseWordsSentenceFromText(text));
final var s = extractUppercaseWordsSentenceFromText(text);
System.out.println("--------------------------------------");
System.out.println(s);
System.out.println("page: " + i);
System.out.println("=====================================");

}
return sentences;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,23 @@ void mustIgnoreWordsShorterThanMinSpecifiedLength(){

}

/**
 * Checks what {@code extractUppercaseWordsSentenceFromText} returns for a text in which
 * two uppercase words sit on consecutive lines: the result must print as
 * {@code [SUN, IS]}, i.e. two separate entries.
 */
@Test
void mustIgnoreWordsSeparatedByNewLines(){

// arrange
// "SUN" ends the first line and "IS" starts the second one
final var str = """
The SUN
IS yellow.
""";

// act
// NOTE(review): the second argument (2) is presumably the minimum word length - confirm
final var words = extractUppercaseWordsSentenceFromText(str, 2);

// assert
assertNotNull(words);
// the comparison is made on the string form of the result
assertEquals("""
[SUN, IS]""", words.toString());

}

}

0 comments on commit 9103476

Please sign in to comment.