-
-
Notifications
You must be signed in to change notification settings - Fork 51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Text classification task #245
Changes from 9 commits
03971dc
f4ccd98
8e145f8
d4ce968
0db0336
8d5c7ec
9181089
3c50f87
ef92e98
6a53765
b61f255
513c7df
5304472
00bf770
485e73c
6d63b57
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,108 @@ | ||
"""
    Sanitize() <: Encoding
    Sanitize(tfms)

Encodes `Paragraph`s by applying various textual transforms (sanitizers),
in the order given by `tfms`. Defaults to `DEFAULT_SANITIZERS`.

Encodes
- `Paragraph` -> `Paragraph`
"""
struct Sanitize <: Encoding
    tfms  # iterable of transforms applied in order; NOTE(review): likely always a Vector — could be constrained
end

Sanitize() = Sanitize(DEFAULT_SANITIZERS)
|
||
|
||
encodedblock(::Sanitize, block::Paragraph) = block | ||
|
||
# Thread the observation through every sanitizing transform, in order.
function encode(enc::Sanitize, context, block::Paragraph, obs)
    return foldl((acc, tfm) -> tfm(acc), values(enc.tfms); init = obs)
end
|
||
"""
    Tokenize() <: Encoding
    Tokenize(tfms)

Encodes `Paragraph`s by splitting them into tokens using the transforms
in `tfms`. Defaults to `DEFAULT_TOKENIZERS`.

Encodes
- `Paragraph` -> `Tokens`
"""
struct Tokenize <: Encoding
    tfms  # iterable of tokenizing transforms applied in order
end

Tokenize() = Tokenize(DEFAULT_TOKENIZERS)
|
||
# Tokenization changes the block type: `Paragraph` -> `Tokens`.
encodedblock(::Tokenize, ::Paragraph) = Tokens()
|
||
# Thread the observation through every tokenizing transform, in order.
function encode(enc::Tokenize, context, block::Paragraph, obs)
    return foldl((acc, tfm) -> tfm(acc), values(enc.tfms); init = obs)
end
|
||
"""
    computevocabulary(data) -> OrderedDict{String, Int}

Build a vocabulary from `data` by sanitizing and tokenizing the input
paragraph of every observation, then counting token occurrences.

NOTE(review): the returned dict maps each token to its *frequency*, not to a
unique integer id. `encode(::EmbedVocabulary, ...)` indexes into this dict to
numericalize tokens, so two tokens with equal counts receive the same number —
confirm this is the intended numericalization scheme.
"""
function computevocabulary(data)
    sanitizer = Sanitize()
    sanitized = map(i -> encode(sanitizer, Training(), Paragraph(), getobs(data, i)[1]),
                    1:numobs(data))

    tokenizer = Tokenize()
    tokenized = map(i -> encode(tokenizer, Training(), Paragraph(), getobs(sanitized, i)),
                    1:numobs(data))

    # Count occurrences of every token across the whole dataset.
    counts = Dict{String, Int}()
    for sample in tokenized, token in sample
        counts[token] = get(counts, token, 0) + 1
    end
    return OrderedDict(counts)
end
|
||
"""
    EmbedVocabulary(; vocab)

Encodes `Tokens` into a vector of numbers by looking each token up in
`vocab`. Use `setup(EmbedVocabulary, data)` to compute the vocabulary
from a dataset.

Encodes
- `Tokens` -> `NumberVector`
"""
struct EmbedVocabulary <: Encoding
    vocab  # token -> number lookup (see `computevocabulary`)
end

EmbedVocabulary(; vocab) = EmbedVocabulary(vocab)
|
||
# Build an `EmbedVocabulary` whose vocabulary is computed from `data`.
function setup(::Type{EmbedVocabulary}, data)
    return EmbedVocabulary(vocab = computevocabulary(data))
end
|
||
# Numericalization changes the block type: `Tokens` -> `NumberVector`.
encodedblock(::EmbedVocabulary, ::Tokens) = NumberVector()
|
||
# Numericalize each token via the vocabulary lookup.
# NOTE(review): an out-of-vocabulary token throws a `KeyError` here — confirm
# whether an "unknown token" fallback is wanted.
function encode(enc::EmbedVocabulary, context, block::Tokens, obs)
    return map(token -> enc.vocab[token], obs)
end
|
||
|
||
# ## Tests

@testset "TextPreprocessing [Encoding]" begin
    raw_text = "Unsanintized text, this has to be sanitized. Then it should be tokenized. Finally it has to be numericalized"

    # Sanitization: `Paragraph` -> `Paragraph`
    sanitizer = Sanitize()
    testencoding(sanitizer, Paragraph(), raw_text)

    # sample_input_sanitized = "xxbos xxmaj unsanintized text sanitized xxmaj tokenized xxmaj finally numericalized"
    sanitized = encode(sanitizer, Training(), Paragraph(), raw_text)

    # Tokenization: `Paragraph` -> `Tokens`
    tokenizer = Tokenize()
    testencoding(tokenizer, Paragraph(), sanitized)

    # tokenized_input = ["xxbos", "xxmaj", "unsanintized", "text", "sanitized", "tokenized", "finally", "numericalized"]
    tokens = encode(tokenizer, Training(), Paragraph(), sanitized)

    # Numericalization: `Tokens` -> `NumberVector`
    embedding = setup(EmbedVocabulary, [[raw_text]])
    testencoding(EmbedVocabulary(vocab = embedding.vocab), Tokens(), tokens)
end
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
"""
    TextClassificationSingle(blocks[, data])

Learning task for single-label text classification. Samples are
preprocessed by applying various textual transforms (sanitization,
tokenization and numericalization, with the vocabulary computed from
`data`) and classified into one of `classes`.
"""
function TextClassificationSingle(blocks::Tuple{<:Paragraph,<:Label}, data)
    return SupervisedTask(
        blocks,
        (
            Sanitize(),
            Tokenize(),
            setup(EmbedVocabulary, data),
            OneHot()
        )
    )
end
|
||
# Register the task in the module-level task registry so it can be
# discovered by id.
_tasks["textclfsingle"] = (
    id="textual/textclfsingle",
    name="Text classification (single-label)",
    constructor=TextClassificationSingle,
    blocks=(Paragraph, Label),
    category="supervised",
    description="""
    Single-label text classification task where every text has a single
    class label associated with it.
    """,
    package=@__MODULE__,
)
|
||
# ## Tests | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do these work? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I checked it and it's working fine. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Then I would leave them uncommented so CI can do its work :) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure! now that I've changed the structure of |
||
|
||
@testset "TextClassificationSingle [task]" begin
    # A minimal two-class task built from a single labeled sample.
    task = TextClassificationSingle((Paragraph(), Label{String}(["neg", "pos"])), [("A good review", "pos")])
    testencoding(getencodings(task), getblocks(task).sample, ("A good review", "pos"))
    FastAI.checktask_core(task, sample = ("A good review", "pos"))

    @testset "`encodeinput`" begin
        text = "A good review"
        encoded = encodeinput(task, Training(), text)
        @test eltype(encoded) == Int64
    end

    @testset "`encodetarget`" begin
        label = "pos"
        onehot = encodetarget(task, Training(), label)
        @test onehot ≈ [0, 1]
    end
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is
tfms
always a Vector? This can probably be constrained further.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm.. Not always a vector and it can be empty too. But if one transform is used that'll probably be used along with a couple of other transforms like remove_puncts, stripping case etc.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What can it be other than a vector? Can you enumerate the possible types?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I mean it can either be empty (empty vector?) or a vector. For example, in the case of text generation, we don't have any sanitization for the input data and here sanitization step can be skipped. But yes! it'll always be a vector then.