Text classification task (#245)
* Add `TextClassificationTask`

* Add tokenization and tests for new transforms.

* Add notebook for data pipeline

Co-authored-by: lorenzoh <lorenz.ohly@gmail.com>
Chandu-4444 and lorenzoh committed Jul 18, 2022
1 parent dabd150 commit 2d396e2
Showing 10 changed files with 590 additions and 12 deletions.
3 changes: 3 additions & 0 deletions FastText/Project.toml
@@ -4,10 +4,13 @@ authors = ["Lorenz Ohly", "FluxML Community"]
version = "0.1.0"

[deps]
DataStructures = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
FastAI = "5d0beca9-ade8-49ae-ad0b-a3cf890e669f"
InlineTest = "bd334432-b1e7-49c7-a2dc-dd9149e4ebd6"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
WordTokenizers = "796a5d58-b03d-544a-977e-18100b691f6e"

[compat]
FastAI = "0.5"
38 changes: 37 additions & 1 deletion FastText/src/FastText.jl
@@ -16,16 +16,52 @@ using FastAI:

using FastAI.Datasets

using ..FastAI: testencoding

# extending
import ..FastAI:
blockmodel, blockbackbone, blocklossfn, encode, decode, checkblock,
encodedblock, decodedblock, showblock!, mockblock, setup, encodestate,
decodestate

using InlineTest
using Random
using TextAnalysis:
StringDocument, prepare!, strip_stopwords, text,
strip_html_tags, strip_non_letters, strip_numbers
using DataStructures: OrderedDict

using WordTokenizers: TokenBuffer, isdone, character, spaces, nltk_url1, nltk_url2, nltk_phonenumbers


include("recipes.jl")
include("blocks/text.jl")
include("transform.jl")
include("encodings/textpreprocessing.jl")

const _tasks = Dict{String,Any}()
include("tasks/classification.jl")

const DEFAULT_SANITIZERS = [
replace_all_caps,
replace_sentence_case,
convert_lowercase,
remove_punctuations,
basic_preprocessing,
remove_extraspaces
]

const DEFAULT_TOKENIZERS = [tokenize]

function __init__()
FastAI.Registries.registerrecipes(@__MODULE__, RECIPES)
foreach(values(_tasks)) do t
if !haskey(FastAI.learningtasks(), t.id)
push!(FastAI.learningtasks(), t)
end
end
end

-export Paragraph
+export Paragraph, TextClassificationSingle, Sanitize, Tokenize

end
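As a quick orientation, here is a hedged sketch of running the default sanitizer chain by hand, the same fold the `Sanitize` encoding's `encode` loop (defined in `encodings/textpreprocessing.jl` below) performs; the input string is invented:

# Hedged sketch: apply DEFAULT_SANITIZERS in order to one paragraph.
sanitized = foldl((s, tfm) -> tfm(s), DEFAULT_SANITIZERS;
                  init = "An OKAY film. A GREAT watch!")   # invented input
# `sanitized` now starts with the xxbos marker, carries xxup/xxmaj tags, and
# has punctuation, stop words, and extra whitespace stripped.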
15 changes: 10 additions & 5 deletions FastText/src/blocks/text.jl
@@ -2,7 +2,7 @@
Paragraph() <: Block
[`Block`](#) for a text paragraph containing one or more
-sentences (basically, a single observation in the textual dataset).
+sentences (basically, a single observation in the textual dataset).
`data` is valid for `Paragraph` if it is of type string.
Example valid Paragraphs:
@@ -26,7 +26,12 @@ FastAI.mockblock(Paragraph())
struct Paragraph <: Block end

FastAI.checkblock(::Paragraph, ::String) = true
-function FastAI.mockblock(::Paragraph)
-    randstring(" ABCEEFGHIJKLMNOPQESRUVWXYZ 1234567890 abcdefghijklmnopqrstynwxyz\n\t.,",
-               rand(10:40))
-end
+FastAI.mockblock(::Paragraph) = randstring(" ABCEEFGHIJKLMNOPQESRUVWXYZ 1234567890 abcdefghijklmnopqrstynwxyz\n\t.,", rand(10:40))

struct Tokens <: Block end

FastAI.checkblock(::Tokens, ::Vector{String}) = true

struct NumberVector <: Block end

FastAI.checkblock(::NumberVector, ::Vector{Int64}) = true
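For reference, a small sketch of the contracts these block definitions establish (return values shown as comments; the example data is made up):

FastAI.checkblock(Paragraph(), "A short paragraph.")       # true: any String is valid
FastAI.checkblock(Tokens(), ["a", "short", "paragraph"])   # true: a Vector{String}
FastAI.checkblock(NumberVector(), [4, 2, 7])               # true: a Vector{Int64}
FastAI.mockblock(Paragraph())                              # a random String observation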
108 changes: 108 additions & 0 deletions FastText/src/encodings/textpreprocessing.jl
@@ -0,0 +1,108 @@
"""
TextEncoding() <: Encoding
Encodes `Paragraph`s by applying various textual transforms.
Encodes
- `Paragraph` -> `Paragraph`
"""
struct Sanitize <: Encoding
tfms
end

Sanitize() = Sanitize(DEFAULT_SANITIZERS)


encodedblock(::Sanitize, block::Paragraph) = block

function encode(p::Sanitize, context, block::Paragraph, obs)
for tfm in values(p.tfms)
obs = tfm(obs)
end
obs
end

struct Tokenize <: Encoding
tfms
end

Tokenize() = Tokenize(DEFAULT_TOKENIZERS)

function encodedblock(p::Tokenize, block::Paragraph)
return Tokens()
end

function encode(p::Tokenize, context, block::Paragraph, obs)
for tfm in values(p.tfms)
obs = tfm(obs)
end
obs
end

function computevocabulary(data)
lookup_table = Dict{String, Int}()

enc1 = Sanitize()
sanitized_Data = map(i -> encode(enc1, Training(), Paragraph(), getobs(data, i)[1]), 1:numobs(data))

enc2 = Tokenize()
tokenized_data = map(i -> encode(enc2, Training(), Paragraph(), getobs(sanitized_Data, i)), 1:numobs(data))

vocab = []
for sample in tokenized_data
for token in sample
lookup_table[token] = get(lookup_table, token, 0) + 1
end
end
return OrderedDict(lookup_table)
end

struct EmbedVocabulary <: Encoding
vocab
end

function EmbedVocabulary(; vocab)
return EmbedVocabulary(vocab)
end

function setup(::Type{EmbedVocabulary}, data)
vocab = computevocabulary(data)
return EmbedVocabulary(vocab = vocab)
end

function encodedblock(p::EmbedVocabulary, block::Tokens)
return NumberVector()
end

function encode(p::EmbedVocabulary, context, block::Tokens, obs)
vocabulary = p.vocab

return [vocabulary[token] for token in obs]
end


# ## Tests

@testset "TextPreprocessing [Encoding]" begin
sample_input = "Unsanintized text, this has to be sanitized. Then it should be tokenized. Finally it has to be numericalized"
block = Paragraph()
enc1 = Sanitize()
testencoding(enc1, block, sample_input)

# sample_input_sanitized = "xxbos xxmaj unsanintized text sanitized xxmaj tokenized xxmaj finally numericalized"
sample_input_sanitized = encode(enc1, Training(), block, sample_input)
block = Paragraph()
enc2 = Tokenize()
testencoding(enc2, block, sample_input_sanitized)

# tokenized_input = ["xxbos", "xxmaj", "unsanintized", "text", "sanitized", "tokenized", "finally", "numericalized"]
tokenized_input = encode(enc2, Training(), block, sample_input_sanitized)
block = Tokens()
vocab = setup(EmbedVocabulary, [[sample_input]])
enc3 = EmbedVocabulary(vocab = vocab.vocab)
testencoding(enc3, block, tokenized_input)


end
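Reading the three encodings together, a hedged end-to-end sketch of the preprocessing chain on one observation (the review text and dataset are invented; `setup` builds the vocabulary exactly as in the test above):

data = [("a great great movie", "pos")]                  # invented (text, label) data
sanitized = encode(Sanitize(), Training(), Paragraph(), "a great great movie")
tokens    = encode(Tokenize(), Training(), Paragraph(), sanitized)
emb       = setup(EmbedVocabulary, data)                 # sanitizes, tokenizes, counts tokens
ids       = encode(emb, Training(), Tokens(), tokens)    # Vector{Int} of table values

As written, `computevocabulary` stores per-token counts in the lookup table, so the integers produced here are those stored counts rather than dense indices.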
2 changes: 1 addition & 1 deletion FastText/src/recipes.jl
@@ -37,7 +37,7 @@ end
# Registering recipes

const RECIPES = Dict{String, Vector}("imdb" => [
-    TextFolders(filefilterfn = f -> !occursin(r"tmp_clas|tmp_lm|unsup",
+    TextFolders(filefilterfn = f -> !occursin(r"tmp_clas|tmp_lm|unsup|test",
f)),
])

53 changes: 53 additions & 0 deletions FastText/src/tasks/classification.jl
@@ -0,0 +1,53 @@
"""
TextClassificationSingle(blocks[, data])
Learning task for single-label text classification. Samples are
preprocessed by applying various textual transforms and classified into one of `classes`.
"""
function TextClassificationSingle(blocks::Tuple{<:Paragraph,<:Label}, data)
return SupervisedTask(
blocks,
(
Sanitize(),
Tokenize(),
setup(EmbedVocabulary, data),
# EmbedVocabulary(),
OneHot()
)
)
end

_tasks["textclfsingle"] = (
id="textual/textclfsingle",
name="Text classification (single-label)",
constructor=TextClassificationSingle,
blocks=(Paragraph, Label),
category="supervised",
description="""
Single-label text classification task where every text has a single
class label associated with it.
""",
package=@__MODULE__,
)

# ## Tests

@testset "TextClassificationSingle [task]" begin
task = TextClassificationSingle((Paragraph(), Label{String}(["neg", "pos"])), [("A good review", "pos")])
testencoding(getencodings(task), getblocks(task).sample, ("A good review", "pos"))
FastAI.checktask_core(task, sample = ("A good review", "pos"))

@testset "`encodeinput`" begin
paragraph = "A good review"

xtrain = encodeinput(task, Training(), paragraph)
@test eltype(xtrain) == Int64
end

@testset "`encodetarget`" begin
category = "pos"
y = encodetarget(task, Training(), category)
@test y ≈ [0, 1]
end
end
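A hedged usage sketch mirroring the test above (the review strings and labels are invented):

data = [("A good review", "pos"), ("A bad review", "neg")]   # invented labelled data
task = TextClassificationSingle((Paragraph(), Label{String}(["neg", "pos"])), data)
x = encodeinput(task, Training(), "A good review")           # Vector{Int64}
y = encodetarget(task, Training(), "pos")                    # one-hot target, here [0, 1]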
43 changes: 38 additions & 5 deletions FastText/src/transform.jl
@@ -21,13 +21,46 @@ end

convert_lowercase(t) = string("xxbos ", lowercase(t))

function remove_punctuations(t)
return replace(t, r"[^\w\s]+" => " ")
end

function basic_preprocessing(t)
doc = StringDocument(t)
prepare!(doc, strip_stopwords)
prepare!(doc, strip_html_tags)
prepare!(doc, strip_non_letters)
prepare!(doc, strip_numbers)
return text(doc)

end

function remove_extraspaces(t)
return replace(t, r"\s+" => " ")
end

function tokenize(t)
urls(ts) = nltk_url1(ts) || nltk_url2(ts)

ts = TokenBuffer(t)
while !isdone(ts)
spaces(ts) && continue
urls(ts) ||
nltk_phonenumbers(ts) ||
character(ts)
end
return ts.tokens
end

## Tests

@testset "Text Transforms" begin
str1 = "Hello WORLD CAPITAL Sentence Case"
str1 = "Hello WORLD CAPITAL Sentence Case."

@test replace_all_caps(str1) == "Hello xxup world xxup capital Sentence Case"
@test replace_sentence_case(str1) ==
"xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case"
@test convert_lowercase(str1) == "xxbos hello world capital sentence case"
@test replace_all_caps(str1) == "Hello xxup world xxup capital Sentence Case."
@test replace_sentence_case(str1) == "xxmaj hello WORLD CAPITAL xxmaj sentence xxmaj case."
@test convert_lowercase(str1) == "xxbos hello world capital sentence case."
@test remove_punctuations(str1) == "Hello WORLD CAPITAL Sentence Case "
@test remove_extraspaces(str1) == "Hello WORLD CAPITAL Sentence Case."
@test tokenize(str1) == ["Hello", "WORLD", "CAPITAL", "Sentence", "Case."]
end
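Because the `TokenBuffer` loop tries the URL and phone-number matchers before falling back to `character`, URLs should survive as single tokens; an illustrative sketch, assuming the WordTokenizers matchers behave as documented:

tokenize("see https://fluxml.ai for details")
# expected: ["see", "https://fluxml.ai", "for", "details"]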
1 change: 1 addition & 0 deletions Project.toml
@@ -22,6 +22,7 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
TextAnalysis = "a2db99b7-8b79-58f8-94bf-bbc811eef33d"
UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"

[compat]