Text classification task (#245)
* Add `TextClassificationTask`

* Add tokenization and tests for new transforms.

* Add notebook for data pipeline

Co-authored-by: lorenzoh <lorenz.ohly@gmail.com>
Chandu-4444 and lorenzoh committed Jul 18, 2022
1 parent dabd150 commit 2d396e2
Showing 10 changed files with 590 additions and 12 deletions.
3 changes: 3 additions & 0 deletions FastText/Project.toml
@@ -4,10 +4,13 @@ authors = ["Lorenz Ohly", "FluxML Community"]
version = "0.1.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
FastAI = "5d0beca9-ade8-49ae-ad0b-a3cf890e669f"
InlineTest = "bd334432-b1e7-49c7-a2dc-dd9149e4ebd6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"

[compat]
FastAI = "0.5"
38 changes: 37 additions & 1 deletion FastText/src/FastText.jl
@@ -16,16 +16,52 @@ using FastAI:

using FastAI.Datasets

using ..FastAI: testencoding

# extending
import ..FastAI:
blockmodel, blockbackbone, blocklossfn, encode, decode, checkblock,
encodedblock, decodedblock, showblock!, mockblock, setup, encodestate,
decodestate

using InlineTest
using Random
using TextAnalysis:
StringDocument, prepare!, strip_stopwords, text,
strip_html_tags, strip_non_letters, strip_numbers
using DataStructures: OrderedDict

using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_url1, nltk_url2, nltk_phonenumbers


include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")
include("encodings/textpreprocessing.jl")

const _tasks = Dict{String,Any}()
include("tasks/classification.jl")

const DEFAULT_SANITIZERS = [
replace_all_caps,
replace_sentence_case,
convert_lowercase,
remove_punctuations,
basic_preprocessing,
remove_extraspaces
]

const DEFAULT_TOKENIZERS = [tokenize]

function __init__()
FastAI.Registries.registerrecipes(@__MODULE__, RECIPES)
foreach(values(_tasks)) do t
if !haskey(FastAI.learningtasks(), t.id)
push!(FastAI.learningtasks(), t)
end
end
end

-export Paragraph
+export Paragraph, TextClassificationSingle, Sanitize, Tokenize

end
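As a quick orientation, here is a hedged sketch of running the default sanitizer chain by hand, the same fold the `Sanitize` encoding's `encode` loop (defined in `encodings/textpreprocessing.jl` below) performs; the input string is invented:

# Hedged sketch: apply DEFAULT_SANITIZERS in order to one paragraph.
sanitized = foldl((s, tfm) -> tfm(s), DEFAULT_SANITIZERS;
                  init = "An OKAY film. A GREAT watch!")   # invented input
# `sanitized` now starts with the xxbos marker, carries xxup/xxmaj tags, and
# has punctuation, stop words, and extra whitespace stripped.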
15 changes: 10 additions & 5 deletions FastText/src/blocks/text.jl
@@ -2,7 +2,7 @@
Paragraph() <: Block
[`Block`](#) for a text paragraph containing one or more
-sentences (basically, a single observation in the textual dataset).
+sentences (basically, a single observation in the textual dataset).
`data` is valid for `Paragraph` if it is of type string.
Example valid Paragraphs:
@@ -26,7 +26,12 @@ FastAI.mockblock(Paragraph())
struct Paragraph <: Block end

FastAI.checkblock(::Paragraph, ::String) = true
-function FastAI.mockblock(::Paragraph)
-    randstring(" ABCEEFGHIJKLMNOPQESRUVWXYZ 1234567890 abcdefghijklmnopqrstynwxyz\n\t.,",
-               rand(10:40))
-end
+FastAI.mockblock(::Paragraph) = randstring(" ABCEEFGHIJKLMNOPQESRUVWXYZ 1234567890 abcdefghijklmnopqrstynwxyz\n\t.,", rand(10:40))

struct Tokens <: Block end

FastAI.checkblock(::Tokens, ::Vector{String}) = true

struct NumberVector <: Block end

FastAI.checkblock(::NumberVector, ::Vector{Int64}) = true
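For reference, a small sketch of the contracts these block definitions establish (return values shown as comments; the example data is made up):

FastAI.checkblock(Paragraph(), "A short paragraph.")       # true: any String is valid
FastAI.checkblock(Tokens(), ["a", "short", "paragraph"])   # true: a Vector{String}
FastAI.checkblock(NumberVector(), [4, 2, 7])               # true: a Vector{Int64}
FastAI.mockblock(Paragraph())                              # a random String observation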
108 changes: 108 additions & 0 deletions FastText/src/encodings/textpreprocessing.jl
@@ -0,0 +1,108 @@
"""
TextEncoding() <: Encoding
Encodes `Paragraph`s by applying various textual transforms.
Encodes
- `Paragraph` -> `Paragraph`
"""
struct Sanitize <: Encoding
tfms
end

Sanitize() = Sanitize(DEFAULT_SANITIZERS)


encodedblock(::Sanitize, block::Paragraph) = block

function encode(p::Sanitize, context, block::Paragraph, obs)
for tfm in values(p.tfms)
obs = tfm(obs)
end
obs
end

struct Tokenize <: Encoding
tfms
end

Tokenize() = Tokenize(DEFAULT_TOKENIZERS)

function encodedblock(p::Tokenize, block::Paragraph)
return Tokens()
end

function encode(p::Tokenize, context, block::Paragraph, obs)
for tfm in values(p.tfms)
obs = tfm(obs)
end
obs
end

function computevocabulary(data)
lookup_table = Dict{String, Int}()

enc1 = Sanitize()
sanitized_Data = map(i -> encode(enc1, Training(), Paragraph(), getobs(data, i)[1]), 1:numobs(data))

enc2 = Tokenize()
tokenized_data = map(i -> encode(enc2, Training(), Paragraph(), getobs(sanitized_Data, i)), 1:numobs(data))

vocab = []
for sample in tokenized_data
for token in sample
lookup_table[token] = get(lookup_table, token, 0) + 1
end
end
return OrderedDict(lookup_table)
end

struct EmbedVocabulary <: Encoding
vocab
end

function EmbedVocabulary(; vocab)
return EmbedVocabulary(vocab)
end

function setup(::Type{EmbedVocabulary}, data)
vocab = computevocabulary(data)
return EmbedVocabulary(vocab = vocab)
end

function encodedblock(p::EmbedVocabulary, block::Tokens)
return NumberVector()
end

function encode(p::EmbedVocabulary, context, block::Tokens, obs)
vocabulary = p.vocab

return [vocabulary[token] for token in obs]
end


# ## Tests

@testset "TextPreprocessing [Encoding]" begin
sample_input = "Unsanintized text, this has to be sanitized. Then it should be tokenized. Finally it has to be numericalized"
block = Paragraph()
enc1 = Sanitize()
testencoding(enc1, block, sample_input)

# sample_input_sanitized = "xxbos xxmaj unsanintized text sanitized xxmaj tokenized xxmaj finally numericalized"
sample_input_sanitized = encode(enc1, Training(), block, sample_input)
block = Paragraph()
enc2 = Tokenize()
testencoding(enc2, block, sample_input_sanitized)

# tokenized_input = ["xxbos", "xxmaj", "unsanintized", "text", "sanitized", "tokenized", "finally", "numericalized"]
tokenized_input = encode(enc2, Training(), block, sample_input_sanitized)
block = Tokens()
vocab = setup(EmbedVocabulary, [[sample_input]])
enc3 = EmbedVocabulary(vocab = vocab.vocab)
testencoding(enc3, block, tokenized_input)


end
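Reading the three encodings together, a hedged end-to-end sketch of the preprocessing chain on one observation (the review text and dataset are invented; `setup` builds the vocabulary exactly as in the test above):

data = [("a great great movie", "pos")]                  # invented (text, label) data
sanitized = encode(Sanitize(), Training(), Paragraph(), "a great great movie")
tokens    = encode(Tokenize(), Training(), Paragraph(), sanitized)
emb       = setup(EmbedVocabulary, data)                 # sanitizes, tokenizes, counts tokens
ids       = encode(emb, Training(), Tokens(), tokens)    # Vector{Int} of table values

As written, `computevocabulary` stores per-token counts in the lookup table, so the integers produced here are those stored counts rather than dense indices.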
2 changes: 1 addition & 1 deletion FastText/src/recipes.jl
@@ -37,7 +37,7 @@ end
# Registering recipes

const RECIPES = Dict{String, Vector}("imdb" => [
-    TextFolders(filefilterfn = f -> !occursin(r"tmp_clas|tmp_lm|unsup",
+    TextFolders(filefilterfn = f -> !occursin(r"tmp_clas|tmp_lm|unsup|test",
f)),
])

53 changes: 53 additions & 0 deletions FastText/src/tasks/classification.jl
@@ -0,0 +1,53 @@
"""
TextClassificationSingle(blocks[, data])
Learning task for single-label text classification. Samples are
preprocessed by applying various textual transforms and classified into one of `classes`.
"""
function TextClassificationSingle(blocks::Tuple{<:Paragraph,<:Label}, data)
return SupervisedTask(
blocks,
(
Sanitize(),
Tokenize(),
setup(EmbedVocabulary, data),
# EmbedVocabulary(),
OneHot()
)
)
end

_tasks["textclfsingle"] = (
id="textual/textclfsingle",
name="Text classification (single-label)",
constructor=TextClassificationSingle,
blocks=(Paragraph, Label),
category="supervised",
description="""
Single-label text classification task where every text has a single
class label associated with it.
""",
package=@__MODULE__,
)

# ## Tests

@testset "TextClassificationSingle [task]" begin
task = TextClassificationSingle((Paragraph(), Label{String}(["neg", "pos"])), [("A good review", "pos")])
testencoding(getencodings(task), getblocks(task).sample, ("A good review", "pos"))
FastAI.checktask_core(task, sample = ("A good review", "pos"))

@testset "`encodeinput`" begin
paragraph = "A good review"

xtrain = encodeinput(task, Training(), paragraph)
@test eltype(xtrain) == Int64
end

@testset "`encodetarget`" begin
category = "pos"
y = encodetarget(task, Training(), category)
@test y ≈ [0, 1]
end
end
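A hedged usage sketch mirroring the test above (the review strings and labels are invented):

data = [("A good review", "pos"), ("A bad review", "neg")]   # invented labelled data
task = TextClassificationSingle((Paragraph(), Label{String}(["neg", "pos"])), data)
x = encodeinput(task, Training(), "A good review")           # Vector{Int64}
y = encodetarget(task, Training(), "pos")                    # one-hot target, here [0, 1]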
43 changes: 38 additions & 5 deletions FastText/src/transform.jl
@@ -21,13 +21,46 @@ end

convert_lowercase(t) = string("xxbos ", lowercase(t))

function remove_punctuations(t)
return replace(t, r"[^\w\s]+" => " ")
end

function basic_preprocessing(t)
doc = StringDocument(t)
prepare!(doc, strip_stopwords)
prepare!(doc, strip_html_tags)
prepare!(doc, strip_non_letters)
prepare!(doc, strip_numbers)
return text(doc)

end

function remove_extraspaces(t)
return replace(t, r"\s+" => " ")
end

function tokenize(t)
urls(ts) = nltk_url1(ts) || nltk_url2(ts)

ts = TokenBuffer(t)
while !isdone(ts)
spaces(ts) && continue
urls(ts) ||
nltk_phonenumbers(ts) ||
character(ts)
end
return ts.tokens
end

## Tests

@testset "Text Transforms" begin
str1 = "Hello WORLD CAPITAL Sentence Case"
str1 = "Hello WORLD CAPITAL Sentence Case."

@test replace_all_caps(str1) == "Hello xxup world xxup capital Sentence Case"
@test replace_sentence_case(str1) ==
"xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case"
@test convert_lowercase(str1) == "xxbos hello world capital sentence case"
@test replace_all_caps(str1) == "Hello xxup world xxup capital Sentence Case."
@test replace_sentence_case(str1) == "xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case."
@test convert_lowercase(str1) == "xxbos hello world capital sentence case."
@test remove_punctuations(str1) == "Hello WORLD CAPITAL Sentence Case "
@test remove_extraspaces(str1) == "Hello WORLD CAPITAL Sentence Case."
@test tokenize(str1) == ["Hello", "WORLD", "CAPITAL", "Sentence", "Case."]
end
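Because the `TokenBuffer` loop tries the URL and phone-number matchers before falling back to `character`, URLs should survive as single tokens; an illustrative sketch, assuming the WordTokenizers matchers behave as documented:

tokenize("see https://fluxml.ai for details")
# expected: ["see", "https://fluxml.ai", "for", "details"]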
1 change: 1 addition & 0 deletions Project.toml
@@ -22,6 +22,7 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"

[compat]