Skip to content

Commit

Permalink
Merge pull request #150 from karin0018/dev
Browse files Browse the repository at this point in the history
[DOC] Update docs
  • Loading branch information
nnnyt committed Aug 5, 2023
2 parents 4ba7ec4 + c840248 commit 5f2f71f
Show file tree
Hide file tree
Showing 11 changed files with 328 additions and 77 deletions.
23 changes: 22 additions & 1 deletion EduNLP/I2V/i2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,39 @@ def __call__(self, items, *args, **kwargs):
return self.infer_vector(items, *args, **kwargs)

def tokenize(self, items, *args, key=lambda x: x, **kwargs) -> list:
    """
    Tokenize items.

    Parameters
    ----------
    items : list
        A list of questions.
    key : callable
        Maps each raw item to the text that should be tokenized
        (identity by default).

    Returns
    -------
    tokens : list
        The tokenized items, as produced by ``self.tokenizer``.
    """
    return self.tokenizer(items, *args, key=key, **kwargs)

def infer_vector(self, items, key=lambda x: x, **kwargs) -> tuple:
    """
    Get question embeddings.

    Abstract method: subclasses must override this.

    Parameters
    ----------
    items : list
        A list of questions.
    key : callable
        Maps each raw item to the text to embed.

    Raises
    ------
    NotImplementedError
        Always; this base implementation is a stub.
    """
    raise NotImplementedError

def infer_item_vector(self, tokens, *args, **kwargs) -> ...:
    """
    Return only the item-level (question) vector.

    Delegates to ``infer_vector`` and keeps the first element of its
    (item_vector, token_vector) result; raises NotImplementedError only
    when the subclass has not implemented ``infer_vector``.
    """
    return self.infer_vector(tokens, *args, **kwargs)[0]

def infer_token_vector(self, tokens, *args, **kwargs) -> ...:
    """
    Return only the token-level vectors.

    Delegates to ``infer_vector`` and keeps the second element of its
    (item_vector, token_vector) result; raises NotImplementedError only
    when the subclass has not implemented ``infer_vector``.
    """
    return self.infer_vector(tokens, *args, **kwargs)[1]

def save(self, config_path):
    """
    Save the model parameters to ``config_path`` as JSON.

    Parameters
    ----------
    config_path : str
        Path of the JSON file the parameters are written to.
    """
    # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable.
    with open(config_path, "w", encoding="utf-8") as wf:
        json.dump(self.params, wf, ensure_ascii=False, indent=2)

Expand All @@ -126,6 +146,7 @@ def load(cls, config_path, *args, **kwargs):

@classmethod
def from_pretrained(cls, name, model_dir=MODEL_DIR, *args, **kwargs):
    """Load a pretrained model by name; must be implemented by subclasses."""
    raise NotImplementedError

@property
Expand Down
10 changes: 10 additions & 0 deletions EduNLP/Vector/disenqnet/disenqnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ def infer_vector(self, items: dict, vector_type=None, **kwargs) -> torch.Tensor:

def infer_tokens(self, items: dict, **kwargs) -> torch.Tensor:
    """
    Get token embeddings with DisenQModel.

    Parameters
    ----------
    items : dict
        {'content_idx': tensor(), 'content_len': tensor()}, the tokens of
        the question after tokenizer processing.

    Returns
    -------
    torch.Tensor
        Token embeddings.
    """
    # self(items) returns a 3-tuple; only the token embeddings (first
    # element) are needed here.
    embedded, _, _ = self(items)
    return embedded

@property
Expand Down
18 changes: 18 additions & 0 deletions EduNLP/Vector/elmo_vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,15 @@ def __call__(self, items: dict):
return outputs

def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
"""
get sentence vector embedding with ElmoModel
Parameters
----------
items: dict, {'seq_idx': tensor(),'seq_len':tensor()}, the tokens about question after tokenizer processing
Returns:
torch.Tensor: sentence embedding
"""
outputs = self(items)
item_embeds = torch.cat(
(outputs.forward_output[torch.arange(len(items["seq_len"])), torch.tensor(items["seq_len"]) - 1],
Expand All @@ -29,6 +38,15 @@ def infer_vector(self, items: dict, **kwargs) -> torch.Tensor:
return item_embeds

def infer_tokens(self, items, **kwargs) -> torch.Tensor:
"""
get tokens embedding with ElmoModel
Parameters
----------
items: dict, {'seq_idx': tensor()}, the tokens about question after tokenizer processing
Returns:
torch.Tensor: token embedding
"""
outputs = self(items)
forward_hiddens = outputs.forward_output
backward_hiddens = outputs.backward_output
Expand Down
58 changes: 58 additions & 0 deletions EduNLP/Vector/gensim_vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,32 @@ def __getitem__(self, item):
return self.wv[item] if index not in self.constants.values() else np.zeros((self.vector_size,))

def infer_vector(self, items, agg="mean", **kwargs) -> list:
    """
    Get sentence embeddings with the word2vec model.

    Parameters
    ----------
    items : list
        Lists of tokens after tokenizer processing.
    agg : str
        Name of the numpy reduction applied over the token vectors of
        each item, e.g. "mean" or "sum".

    Returns
    -------
    vector : list
        [array(), ..., array()]
    """
    token_vectors = self.infer_tokens(items, **kwargs)
    # getattr instead of eval("np.%s" % agg): same lookup, no code execution.
    agg_fn = getattr(np, agg)
    # Items with no tokens fall back to a zero vector so every embedding
    # has the model's fixed dimensionality.
    return [agg_fn(vectors, axis=0) if vectors else np.zeros(self.vector_size,)
            for vectors in token_vectors]

def infer_tokens(self, items, **kwargs) -> list:
    """
    Get token embeddings with the word2vec model.

    Parameters
    ----------
    items : list
        Lists of tokens after tokenizer processing.

    Returns
    -------
    vector : list
        [[array(), ..., array()], [...], [...]]
    """
    # self(*item) yields one vector per token of the item.
    return [list(self(*item)) for item in items]


Expand All @@ -95,6 +116,17 @@ def __init__(self, filepath):
self.dictionary = corpora.Dictionary.load(filepath)

def infer_vector(self, item, return_vec=False):
"""
get Bow vector
Parameters
----------
item: list
the tokens after tokenizer processing
Return
------
vector: list
[array(), ..., array()]
"""
item = self.dictionary.doc2bow(item)
if not return_vec:
return item # return dic as default
Expand All @@ -121,6 +153,17 @@ def __init__(self, filepath):
self.dictionary = corpora.Dictionary.load(dictionary_path)

def infer_vector(self, item, return_vec=False):
"""
get Tf-idf vector
Parameters
----------
item: list
the tokens after tokenizer processing
Return
------
vector: list
[array(), ..., array()]
"""
dic_item = self.dictionary.doc2bow(item)
tfidf_item = self.tfidf_model[dic_item]
# return dic as default
Expand Down Expand Up @@ -181,7 +224,22 @@ def vector_size(self):
return self.d2v.vector_size

def infer_vector(self, items, *args, **kwargs) -> list:
    """
    Get item vectors with the D2V model.

    Parameters
    ----------
    items : list
        Lists of tokens after tokenizer processing.

    Returns
    -------
    vector : list
        [array(), ..., array()]
    """
    vectors = []
    for one_item in items:
        vectors.append(self(one_item))
    return vectors

def infer_tokens(self, item, *args, **kwargs) -> ...:
    """
    Token-level embeddings are not supported by the D2V model.

    Raises
    ------
    NotImplementedError
        Always; doc2vec only produces document-level vectors.
    """
    raise NotImplementedError
43 changes: 42 additions & 1 deletion EduNLP/Vector/t2v.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ class T2V(object):
Examples
--------
>>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$,\
>>> item = [{'ques_content':'有公式$\\FormFigureID{wrong1?}$和公式$\\FormFigureBase64{wrong2?}$, \
... 如图$\\FigureID{088f15ea-8b7c-11eb-897e-b46bfc50aa29}$,若$x,y$满足约束条件$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$'}]
>>> model_dir = "examples/test_model/d2v"
>>> url, model_name, *args = get_pretrained_model_info('d2v_test_256')
Expand All @@ -69,9 +69,24 @@ def __call__(self, items, *args, **kwargs):
return self.i2v.infer_vector(items, *args, **kwargs)

def infer_vector(self, items, *args, **kwargs):
    """
    Get question embeddings with T2V.

    Parameters
    ----------
    items : list
        A list of questions.

    Returns
    -------
    vector : list
        A list of numpy.ndarray (dtype=float32) embeddings, one per
        question.
    """
    return self.i2v.infer_vector(items, *args, **kwargs)

def infer_tokens(self, items, *args, **kwargs):
    """
    Get token embeddings with T2V.

    Delegates to the wrapped model's ``infer_tokens``; models that only
    produce item-level vectors (e.g. D2V) raise NotImplementedError.

    Parameters
    ----------
    items : list
        A list of questions.

    Returns
    -------
    list
        Token embeddings, one sequence per question.
    """
    return self.i2v.infer_tokens(items, *args, **kwargs)

@property
Expand All @@ -80,6 +95,24 @@ def vector_size(self) -> int:


def get_pretrained_model_info(name):
"""
get the pretrained model information with the given name
Parameters
----------
name:str
select the pretrained model
e.g.:
d2v_math_300
w2v_math_300
elmo_math_2048
bert_math_768
bert_taledu_768
disenq_math_256
quesnet_math_512
Returns
--------
list: [model url (where to download), model name]
"""
url = MODELHUB_URL + 'getPretrainedModel'
param = {'name': name}
r = requests.get(url, params=param)
Expand All @@ -89,6 +122,14 @@ def get_pretrained_model_info(name):


def get_all_pretrained_models():
"""
get all pretrained models' name
Returns
-------
the pretrained models' name:list
e.g.['bert_bio_ptc', 'bert_geo_ptc', 'bert_math_768', ... ]
"""
url = MODELHUB_URL + 'getPretrainedModelList'
r = requests.get(url)
assert r.status_code == 200, r.status_code
Expand Down
7 changes: 6 additions & 1 deletion docs/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,26 +3,31 @@ EduNLP document and tutorial folder

Requirements
------------

See the requirements `docs_deps` in `setup.py`:

```sh
pip install -e .[doc]
```


Build documents
---------------

First, clean up existing files:

```
make clean
```

Then build:

```
make html
```

Render locally
--------------

```
cd build/html
python3 -m http.server 8000
Expand Down
4 changes: 2 additions & 2 deletions docs/source/api/vector.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,13 @@ EduNLP.Vector.t2v


EduNLP.Vector.disenqnet
--------------------
-------------------------

.. automodule:: EduNLP.Vector.disenqnet.disenqnet
:members:

EduNLP.Vector.quesnet
--------------------
-------------------------

.. automodule:: EduNLP.Vector.quesnet.quesnet
:members:
Expand Down
2 changes: 2 additions & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,5 @@ def copy_tree(src, tar):
'undoc-members': True,
}
autodoc_member_order = 'bysource'

nbsphinx_allow_errors = True
8 changes: 3 additions & 5 deletions docs/source/tutorial/zh/pipeline.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@
流水线
=======

.. nbgallery::
:caption: This is a thumbnail gallery:
:name: pipleine_gallery
:glob:
.. nbinfo::
notebook:

流水线 <../../build/blitz/pipeline/pipeline.ipynb>
`流水线 <../../build/blitz/pipeline/pipeline.ipynb>`_
Loading

0 comments on commit 5f2f71f

Please sign in to comment.