Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[DOC] Update docs #150

Merged
merged 5 commits into from
Aug 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion EduNLP/I2V/i2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,39 @@ def __call__(self, items, *args, **kwargs):
return self.infer_vector(items, *args, **kwargs)

def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list:
    """
    Tokenize items into token sequences.

    Parameters
    ----------
    items: list
        a list of questions
    key: callable
        accessor that extracts the text to tokenize from each item

    Returns
    -------
    tokens: list
        token sequences produced by the underlying tokenizer
    """
    return self.tokenizer(items, *args, key=key, **kwargs)

def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple:
    """
    Produce question embeddings for the given items.

    Abstract in the base class; concrete I2V subclasses must override it.
    """
    raise NotImplementedError

def infer_item_vector(self, tokens, *args, **kwargs) -> ...:
    """Return only the item-level part of the ``infer_vector`` result."""
    vectors = self.infer_vector(tokens, *args, **kwargs)
    return vectors[0]

def infer_token_vector(self, tokens, *args, **kwargs) -> ...:
    """Return only the token-level part of the ``infer_vector`` result."""
    vectors = self.infer_vector(tokens, *args, **kwargs)
    return vectors[1]

def save(self, config_path):
    """
    Save the model configuration (``self.params``) as JSON.

    Parameters
    ----------
    config_path: str
        path of the file the parameters are written to
    """
    # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text human-readable
    with open(config_path, "w", encoding="utf-8") as wf:
        json.dump(self.params, wf, ensure_ascii=False, indent=2)

Expand All @@ -126,6 +146,7 @@ def load(cls, config_path, *args, **kwargs):

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
    """
    Load a pretrained model by name.

    Abstract in the base class; subclasses provide the concrete
    download/load logic.
    """
    raise NotImplementedError

@property
Expand Down
10 changes: 10 additions & 0 deletions EduNLP/Vector/disenqnet/disenqnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ def infer_vector(self, items: dict, vector_type=None, **kwargs) -> torch.Tensor:

def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor:
embeded, _, _ = self(items)
"""
get tokens embedding with DisenQModel
Parameters
----------
items: dict
{'content_idx': tensor(),'content_len': tensor()}, the tokens about question after tokenizer processing

Returns:
torch.Tensor: token embedding
"""
return embeded

@property
Expand Down
18 changes: 18 additions & 0 deletions EduNLP/Vector/elmo_vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@ def __call__(self, items: dict):
return outputs

def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
"""
get sentence vector embedding with ElmoModel
Parameters
----------
items: dict, {'seq_idx': tensor(),'seq_len':tensor()}, the tokens about question after tokenizer processing

Returns:
torch.Tensor: sentence embedding
"""
outputs = self(items)
item_embeds = torch.cat(
(outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
Expand All @@ -29,6 +38,15 @@ def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
return item_embeds

def infer_tokens(self, items, **kwargs) -> torch.Tensor:
"""
get tokens embedding with ElmoModel
Parameters
----------
items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing

Returns:
torch.Tensor: token embedding
"""
outputs = self(items)
forward_hiddens = outputs.forward_output
backward_hiddens = outputs.backward_output
Expand Down
58 changes: 58 additions & 0 deletions EduNLP/Vector/gensim_vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,32 @@ def __getitem__(self, item):
return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,))

def infer_vector(self, items, agg="mean", **kwargs) -> list:
    """
    Get sentence embeddings with the word2vec model.

    Parameters
    ----------
    items: list
        the tokens after tokenizer processing, one token list per question
    agg: str
        name of the numpy aggregation applied over token vectors
        (e.g. "mean", "sum", "max")

    Returns
    -------
    vector: list
        [array(), ..., array()], one aggregated vector per question;
        a question with no tokens maps to a zero vector of ``vector_size``
    """
    token_vectors = self.infer_tokens(items, **kwargs)
    # getattr resolves the numpy aggregation safely (no eval of a built string)
    agg_fn = getattr(np, agg)
    return [agg_fn(vectors, axis=0) if vectors else np.zeros(self.vector_size,)
            for vectors in token_vectors]

def infer_tokens(self, items, **kwargs) -> list:
    """
    Get token embeddings with the word2vec model.

    Parameters
    ----------
    items: list
        the tokens after tokenizer processing, one token list per question

    Returns
    -------
    vector: list
        [[array(), ..., array()], [...], [...]], one list of token vectors
        per question
    """
    # self(*item) looks up the vector of each token of one question
    return [list(self(*item)) for item in items]


Expand All @@ -95,6 +116,17 @@ def __init__(self, filepath):
self.dictionary = corpora.Dictionary.load(filepath)

def infer_vector(self, item, return_vec=False):
"""
get Bow vector
Parameters
----------
item: list
the tokens after tokenizer processing
Return
------
vector: list
[array(), ..., array()]
"""
item = self.dictionary.doc2bow(item)
if not return_vec:
return item # return dic as default
Expand All @@ -121,6 +153,17 @@ def __init__(self, filepath):
self.dictionary = corpora.Dictionary.load(dictionary_path)

def infer_vector(self, item, return_vec=False):
"""
get Tf-idf vector
Parameters
----------
item: list
the tokens after tokenizer processing
Return
------
vector: list
[array(), ..., array()]
"""
dic_item = self.dictionary.doc2bow(item)
tfidf_item = self.tfidf_model[dic_item]
# return dic as default
Expand Down Expand Up @@ -181,7 +224,22 @@ def vector_size(self):
return self.d2v.vector_size

def infer_vector(self, items, *args, **kwargs) -> list:
    """
    Get question embeddings with the D2V model.

    Parameters
    ----------
    items: list
        the tokens after tokenizer processing, one token list per question

    Returns
    -------
    vector: list
        [array(), ..., array()], one vector per question
    """
    return [self(item) for item in items]

def infer_tokens(self, item, *args, **kwargs) -> ...:
    """
    Get token embeddings with D2V.

    Not implemented for this model.
    """
    raise NotImplementedError
43 changes: 42 additions & 1 deletion EduNLP/Vector/t2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class T2V(object):

Examples
--------
>>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
>>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$, \
... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}]
>>> model_dir = "examples/test_model/d2v"
>>> url, model_name, *args = get_pretrained_model_info('d2v_test_256')
Expand All @@ -69,9 +69,24 @@ def __call__(self, items, *args, **kwargs):
return self.i2v.infer_vector(items, *args, **kwargs)

def infer_vector(self, items, *args, **kwargs):
    """
    Get question embeddings with T2V.

    Parameters
    ----------
    items: list
        a list of questions

    Returns
    -------
    vector: list
        a list of numpy.ndarray (dtype=float32), one vector per question
    """
    return self.i2v.infer_vector(items, *args, **kwargs)

def infer_tokens(self, items, *args, **kwargs):
    """
    Get token embeddings with T2V.

    Delegates to the wrapped model's ``infer_tokens``; only raises
    ``NotImplementedError`` if the underlying model does not support
    token-level embeddings.  (The previous docstring wrongly claimed
    this method itself was not implemented.)

    Parameters
    ----------
    items: list
        a list of questions

    Returns
    -------
    token embeddings produced by the underlying model
    """
    return self.i2v.infer_tokens(items, *args, **kwargs)

@property
Expand All @@ -80,6 +95,24 @@ def vector_size(self) -> int:


def get_pretrained_model_info(name):
"""
get the pretrained model information with the given name
Parameters
----------
name:str
select the pretrained model
e.g.:
d2v_math_300
w2v_math_300
elmo_math_2048
bert_math_768
bert_taledu_768
disenq_math_256
quesnet_math_512
Returns
--------
list: [model url (where to download), model name]
"""
url = MODELHUB_URL + 'getPretrainedModel'
param = {'name': name}
r = requests.get(url, params=param)
Expand All @@ -89,6 +122,14 @@ def get_pretrained_model_info(name):


def get_all_pretrained_models():
"""
get all pretrained models' name

Returns
-------
the pretrained models' name:list
e.g.['bert_bio_ptc', 'bert_geo_ptc', 'bert_math_768', ... ]
"""
url = MODELHUB_URL + 'getPretrainedModelList'
r = requests.get(url)
assert r.status_code == 200, r.status_code
Expand Down
7 changes: 6 additions & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,31 @@ EduNLP document and tutorial folder

Requirements
------------

See the requirements `docs_deps` in `setup.py`:

```sh
pip install -e .[doc]
```


Build documents
---------------

First, clean up existing files:

```
make clean
```

Then build:

```
make html
```

Render locally
--------------

```
cd build/html
python3 -m http.server 8000
Expand Down
4 changes: 2 additions & 2 deletions docs/source/api/vector.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ EduNLP.Vector.t2v


EduNLP.Vector.disenqnet
--------------------
-------------------------

.. automodule:: EduNLP.Vector.disenqnet.disenqnet
:members:

EduNLP.Vector.quesnet
--------------------
-------------------------

.. automodule:: EduNLP.Vector.quesnet.quesnet
:members:
Expand Down
2 changes: 2 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,5 @@ def copy_tree(src, tar):
'undoc-members': True,
}
autodoc_member_order = 'bysource'

nbsphinx_allow_errors = True
8 changes: 3 additions & 5 deletions docs/source/tutorial/zh/pipeline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
流水线
=======

.. nbgallery::
:caption: This is a thumbnail gallery:
:name: pipleine_gallery
:glob:
.. nbinfo::
notebook:

流水线 <../../build/blitz/pipeline/pipeline.ipynb>
`流水线 <../../build/blitz/pipeline/pipeline.ipynb>`_
Loading
Loading