Text classification task #245

Merged · 16 commits · Jul 18, 2022
3 changes: 3 additions & 0 deletions Project.toml
@@ -10,6 +10,7 @@ Colors = "5ae59095-9a9b-59fe-a467-6f913c188581"
DataAugmentation = "88a5189c-e7ff-4f85-ac6b-e6158070f02e"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
FeatureRegistries = "c6aefb4f-3ac3-4095-8805-528476b02c02"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
FilePathsBase = "48062228-2e41-5def-b9a4-89aafe57970f"
@@ -37,7 +38,9 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"

[compat]
8 changes: 8 additions & 0 deletions src/FastAI.jl
@@ -24,10 +24,18 @@ using StaticArrays
using Setfield
using ShowCases
using Tables
using TextAnalysis:
    StringDocument, prepare!, strip_stopwords,
    strip_html_tags, strip_non_letters, strip_numbers
import Test
import UnicodePlots
using Statistics
using InlineTest
using DataStructures: OrderedDict

using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_url1, nltk_url2, nltk_phonenumbers




# ## Learning task API (previously DLPipelines.jl)
40 changes: 39 additions & 1 deletion src/Textual/Textual.jl
@@ -17,18 +17,56 @@ using ..FastAI.Datasets

using ..FastAI.Datasets

# for tests
using ..FastAI: testencoding

# extending
import ..FastAI:
    blockmodel, blockbackbone, blocklossfn, encode, decode, checkblock,
    encodedblock, decodedblock, showblock!, mockblock, setup, encodestate,
    decodestate



import Requires: @require

using InlineTest
using Random
using TextAnalysis:
    StringDocument, prepare!, strip_stopwords, text,
    strip_html_tags, strip_non_letters, strip_numbers
using DataStructures: OrderedDict

using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_url1, nltk_url2, nltk_phonenumbers

include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")
include("encodings/textpreprocessing.jl")

const _tasks = Dict{String,Any}()
include("tasks/classification.jl")

const DEFAULT_SANITIZERS = [
    replace_all_caps,
    replace_sentence_case,
    convert_lowercase,
    remove_punctuations,
    basic_preprocessing,
    remove_extraspaces
]

const DEFAULT_TOKENIZERS = [tokenize]
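
A hedged sketch of how these defaults are meant to be consumed; a custom pipeline can pass its own vector of transforms instead of relying on the constants above:

```julia
enc = Sanitize([convert_lowercase, remove_extraspaces])  # custom subset of sanitizers
tok = Tokenize()                                         # falls back to DEFAULT_TOKENIZERS
```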


function __init__()
    _registerrecipes()
    foreach(values(_tasks)) do t
        if !haskey(FastAI.learningtasks(), t.id)
            push!(FastAI.learningtasks(), t)
        end
    end
end

export Paragraph
export Paragraph, TextClassificationSingle, Sanitize, Tokenize
end
10 changes: 9 additions & 1 deletion src/Textual/blocks/text.jl
@@ -2,7 +2,7 @@
Paragraph() <: Block

[`Block`](#) for a text paragraph containing one or more
sentences (basically, a single observation in a textual dataset).
`data` is valid for `Paragraph` if it is a `String`.

Example valid Paragraphs:
@@ -27,3 +27,11 @@ struct Paragraph <: Block end

FastAI.checkblock(::Paragraph, ::String) = true
FastAI.mockblock(::Paragraph) = randstring(" ABCDEFGHIJKLMNOPQRSTUVWXYZ 1234567890 abcdefghijklmnopqrstuvwxyz\n\t.,", rand(10:40))

struct Tokens <: Block end

FastAI.checkblock(::Tokens, ::Vector{String}) = true

struct NumberVector <: Block end

FastAI.checkblock(::NumberVector, ::Vector{Int64}) = true
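
A minimal sketch of how these blocks validate observations, written from inside the `Textual` module since `Tokens` and `NumberVector` are not exported (the literal values are illustrative):

```julia
FastAI.checkblock(Paragraph(), "A short review.")      # true
FastAI.checkblock(Tokens(), ["a", "short", "review"])  # true
FastAI.checkblock(NumberVector(), [4, 12, 7])          # true
```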
108 changes: 108 additions & 0 deletions src/Textual/encodings/textpreprocessing.jl
@@ -0,0 +1,108 @@
"""
TextEncoding() <: Encoding

Encodes `Paragraph`s by applying various textual transforms.


Encodes
- `Paragraph` -> `Paragraph`

"""
struct Sanitize <: Encoding
    tfms
Member: Is `tfms` always a `Vector`? This can probably be constrained further.

Contributor (author): Hmm, not always a vector, and it can be empty too. But if one transform is used, it will probably be used along with a couple of other transforms like `remove_punctuations`, case stripping, etc.

Member: What can it be other than a vector? Can you enumerate the possible types?

Contributor (author): It can either be empty (an empty vector?) or a vector. For example, in the case of text generation we don't have any sanitization for the input data, so the sanitization step can be skipped. But yes, it'll always be a vector then.
end

Sanitize() = Sanitize(DEFAULT_SANITIZERS)


encodedblock(::Sanitize, block::Paragraph) = block

function encode(p::Sanitize, context, block::Paragraph, obs)
    for tfm in values(p.tfms)
        obs = tfm(obs)
    end
    obs
end
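
For intuition, a minimal usage sketch (the exact output string depends on the transforms in `tfms`, so it is only described, not shown literally):

```julia
enc = Sanitize()
sanitized = encode(enc, Training(), Paragraph(), "I LOVED this Movie!!")
# `sanitized` is still a String, i.e. a valid `Paragraph` observation,
# with case markers added and punctuation/stopwords stripped
```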

struct Tokenize <: Encoding
    tfms
end

Tokenize() = Tokenize(DEFAULT_TOKENIZERS)

function encodedblock(p::Tokenize, block::Paragraph)
    return Tokens()
end

function encode(p::Tokenize, context, block::Paragraph, obs)
    for tfm in values(p.tfms)
        obs = tfm(obs)
    end
    obs
end
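
And a matching sketch for `Tokenize`; with the default whitespace-based tokenizer from transform.jl this should yield:

```julia
encode(Tokenize(), Training(), Paragraph(), "xxbos a good movie")
# -> ["xxbos", "a", "good", "movie"]
```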

function computevocabulary(data)
    lookup_table = Dict{String, Int}()

    # Sanitize and tokenize every observation's input text.
    enc1 = Sanitize()
    sanitized_data = map(i -> encode(enc1, Training(), Paragraph(), getobs(data, i)[1]), 1:numobs(data))

    enc2 = Tokenize()
    tokenized_data = map(i -> encode(enc2, Training(), Paragraph(), getobs(sanitized_data, i)), 1:numobs(data))

    # Count how often each token occurs across the corpus.
    for sample in tokenized_data
        for token in sample
            lookup_table[token] = get(lookup_table, token, 0) + 1
        end
    end
    return OrderedDict(lookup_table)
end

struct EmbedVocabulary <: Encoding
    vocab
end

function EmbedVocabulary(; vocab)
    return EmbedVocabulary(vocab)
end

function setup(::Type{EmbedVocabulary}, data)
    vocab = computevocabulary(data)
    return EmbedVocabulary(vocab = vocab)
end

function encodedblock(p::EmbedVocabulary, block::Tokens)
    return NumberVector()
end

function encode(p::EmbedVocabulary, context, block::Tokens, obs)
    vocabulary = p.vocab
    return [vocabulary[token] for token in obs]
end
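
Putting the three encodings together, a hedged end-to-end sketch (the two-observation toy dataset is made up and follows the `(input, target)` layout used in the tests below):

```julia
data = [("The movie was great", "pos"), ("A boring film", "neg")]

sanitize = Sanitize()
tok      = Tokenize()
embed    = setup(EmbedVocabulary, data)

s = encode(sanitize, Training(), Paragraph(), getobs(data, 1)[1])
t = encode(tok,      Training(), Paragraph(), s)
n = encode(embed,    Training(), Tokens(),    t)   # Vector{Int}
```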


# ## Tests

@testset "TextPreprocessing [Encoding]" begin
    sample_input = "Unsanitized text, this has to be sanitized. Then it should be tokenized. Finally it has to be numericalized"
    block = Paragraph()
    enc1 = Sanitize()
    testencoding(enc1, block, sample_input)

    # sample_input_sanitized = "xxbos xxmaj unsanitized text sanitized xxmaj tokenized xxmaj finally numericalized"
    sample_input_sanitized = encode(enc1, Training(), block, sample_input)
    block = Paragraph()
    enc2 = Tokenize()
    testencoding(enc2, block, sample_input_sanitized)

    # tokenized_input = ["xxbos", "xxmaj", "unsanitized", "text", "sanitized", "tokenized", "finally", "numericalized"]
    tokenized_input = encode(enc2, Training(), block, sample_input_sanitized)
    block = Tokens()
    vocab = setup(EmbedVocabulary, [[sample_input]])
    enc3 = EmbedVocabulary(vocab = vocab.vocab)
    testencoding(enc3, block, tokenized_input)
end
2 changes: 1 addition & 1 deletion src/Textual/recipes.jl
@@ -32,7 +32,7 @@ end

const RECIPES = Dict{String,Vector{Datasets.DatasetRecipe}}(
    "imdb" => [TextFolders(
        filefilterfn=f -> !occursin(r"tmp_clas|tmp_lm|unsup", f)
        filefilterfn=f -> !occursin(r"tmp_clas|tmp_lm|unsup|test", f)
    )],
)

53 changes: 53 additions & 0 deletions src/Textual/tasks/classification.jl
@@ -0,0 +1,53 @@
"""
TextClassificationSingle(blocks[, data])

Learning task for single-label text classification. Samples are
preprocessed by applying various textual transforms and classified into one of `classes`.

"""
function TextClassificationSingle(blocks::Tuple{<:Paragraph,<:Label}, data)
    return SupervisedTask(
        blocks,
        (
            Sanitize(),
            Tokenize(),
            setup(EmbedVocabulary, data),
            # EmbedVocabulary(),
            OneHot()
        )
    )
end

_tasks["textclfsingle"] = (
id="textual/textclfsingle",
name="Text classification (single-label)",
constructor=TextClassificationSingle,
blocks=(Paragraph, Label),
category="supervised",
description="""
Single-label text classification task where every text has a single
class label associated with it.
""",
package=@__MODULE__,
)
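
Once `__init__` has pushed this entry, the task should be discoverable through the registry (a hedged sketch; the indexing API comes from FeatureRegistries.jl as wrapped by `FastAI.learningtasks()`):

```julia
entry = FastAI.learningtasks()["textual/textclfsingle"]
# `entry.constructor` should be `TextClassificationSingle`, as registered above
```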

# ## Tests
Member: Do these work?

Contributor (author): Yes, I checked and it's working fine.

Member: Then I would leave them uncommented so CI can do its work :)

Contributor (author): Sure! Now that I've changed the structure of the blocks a bit, I need to modify these.


@testset "TextClassificationSingle [task]" begin
    task = TextClassificationSingle((Paragraph(), Label{String}(["neg", "pos"])), [("A good review", "pos")])
    testencoding(getencodings(task), getblocks(task).sample, ("A good review", "pos"))
    FastAI.checktask_core(task, sample = ("A good review", "pos"))

    @testset "`encodeinput`" begin
        paragraph = "A good review"

        xtrain = encodeinput(task, Training(), paragraph)
        @test eltype(xtrain) == Int64
    end

    @testset "`encodetarget`" begin
        category = "pos"
        y = encodetarget(task, Training(), category)
        @test y ≈ [0, 1]
    end
end
41 changes: 37 additions & 4 deletions src/Textual/transform.jl
@@ -21,14 +21,47 @@ end

convert_lowercase(t) = string("xxbos ", lowercase(t))

function remove_punctuations(t)
    return replace(t, r"[^\w\s]+" => " ")
end

function basic_preprocessing(t)
    doc = StringDocument(t)
    prepare!(doc, strip_stopwords)
    prepare!(doc, strip_html_tags)
    prepare!(doc, strip_non_letters)
    prepare!(doc, strip_numbers)
    return text(doc)
end

function remove_extraspaces(t)
    return replace(t, r"\s+" => " ")
end

function tokenize(t)
    urls(ts) = nltk_url1(ts) || nltk_url2(ts)

    ts = TokenBuffer(t)
    while !isdone(ts)
        spaces(ts) && continue
        urls(ts) ||
            nltk_phonenumbers(ts) ||
            character(ts)
    end
    return ts.tokens
end
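
For example, the URL and phone-number matchers keep those spans intact while everything else splits on whitespace (expected output shown as a sketch, not verified here):

```julia
tokenize("Visit https://julialang.org for more info")
# expected: ["Visit", "https://julialang.org", "for", "more", "info"]
```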

## Tests


@testset "Text Transforms" begin
    str1 = "Hello WORLD CAPITAL Sentence Case"
    str1 = "Hello WORLD CAPITAL Sentence Case."

    @test replace_all_caps(str1) == "Hello xxup world xxup capital Sentence Case"
    @test replace_sentence_case(str1) == "xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case"
    @test convert_lowercase(str1) == "xxbos hello world capital sentence case"
    @test replace_all_caps(str1) == "Hello xxup world xxup capital Sentence Case."
    @test replace_sentence_case(str1) == "xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case."
    @test convert_lowercase(str1) == "xxbos hello world capital sentence case."
    @test remove_punctuations(str1) == "Hello WORLD CAPITAL Sentence Case "
    @test remove_extraspaces(str1) == "Hello WORLD CAPITAL Sentence Case."
    @test tokenize(str1) == ["Hello", "WORLD", "CAPITAL", "Sentence", "Case."]
end