
Commit

Update scripts
KenelmQLH committed Feb 22, 2024
1 parent 2b2fbe1 commit 5e32928
Showing 30 changed files with 814 additions and 498 deletions.
10 changes: 5 additions & 5 deletions EduNLP/I2V/i2v.py
@@ -51,8 +51,8 @@ class I2V(object):
(...)
>>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True)
>>> i2v = D2V("pure_text", "d2v", filepath=path, pretrained_t2v=False)
>>> i2v(item)
([array([ ...dtype=float32)], None)
>>> i2v(item) # doctest: +SKIP
([array([ ...dtype=float32)], [[array([ ...dtype=float32)]])
Returns
-------
@@ -189,8 +189,8 @@ class D2V(I2V):
(...)
>>> path = path_append(path, os.path.basename(path) + '.bin', to_str=True)
>>> i2v = D2V("pure_text","d2v",filepath=path, pretrained_t2v = False)
>>> i2v(item)
([array([ ...dtype=float32)], None)
>>> i2v(item) # doctest: +SKIP
# ([array([ ...dtype=float32)], [[array([ ...dtype=float32)]])
Returns
-------
@@ -579,7 +579,7 @@ def get_pretrained_i2v(name, model_dir=MODEL_DIR, device='cpu'):
>>> (); i2v = get_pretrained_i2v("d2v_test_256", "examples/test_model/d2v"); () # doctest: +SKIP
(...)
>>> print(i2v(item)) # doctest: +SKIP
([array([ ...dtype=float32)], None)
([array([ ...dtype=float32)], [[array([ ...dtype=float32)]])
"""
pretrained_models = get_all_pretrained_models()
if name not in pretrained_models:
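
The doctest change above reflects that calling an I2V/D2V instance now returns token-level embeddings as the second element of the tuple instead of None. A minimal sketch of the updated call, assuming a trained d2v model file at path and a SIF-formatted question string item (both placeholders, as in the docstring), not part of this commit:

from EduNLP.I2V import D2V
i2v = D2V("pure_text", "d2v", filepath=path, pretrained_t2v=False)
item_vectors, token_vectors = i2v(item)
# item_vectors: one embedding array per item (unchanged behaviour)
# token_vectors: nested list of per-token embedding arrays, previously None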
13 changes: 7 additions & 6 deletions EduNLP/ModelZoo/hf_model/hf_model.py
@@ -2,10 +2,11 @@
from torch import nn
import json
import os
from ..base_model import BaseModel
from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput
from transformers import AutoModel, PretrainedConfig, AutoConfig
from typing import List
from EduNLP.utils.log import logger
from ..base_model import BaseModel
from ..utils import PropertyPredictionOutput, KnowledgePredictionOutput
from ..rnn.harnn import HAM


@@ -17,10 +18,10 @@ def __init__(self, pretrained_model_dir=None, head_dropout=0.5, init=True):
super(HfModelForPropertyPrediction, self).__init__()
bert_config = AutoConfig.from_pretrained(pretrained_model_dir)
if init:
print(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
logger.info(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
self.bert = AutoModel.from_pretrained(pretrained_model_dir)
else:
print(f'Load AutoModel from config: {pretrained_model_dir}')
logger.info(f'Load AutoModel from config: {pretrained_model_dir}')
self.bert = AutoModel(bert_config)
self.hidden_size = self.bert.config.hidden_size
self.head_dropout = head_dropout
@@ -86,10 +87,10 @@ def __init__(self,
super(HfModelForKnowledgePrediction, self).__init__()
bert_config = AutoConfig.from_pretrained(pretrained_model_dir)
if init:
print(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
logger.info(f'Load AutoModel from checkpoint: {pretrained_model_dir}')
self.bert = AutoModel.from_pretrained(pretrained_model_dir)
else:
print(f'Load AutoModel from config: {pretrained_model_dir}')
logger.info(f'Load AutoModel from config: {pretrained_model_dir}')
self.bert = AutoModel(bert_config)
self.hidden_size = self.bert.config.hidden_size
self.head_dropout = head_dropout
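
With the print-to-logger change above, the "Load AutoModel from ..." messages now go through EduNLP's logger. A small sketch of controlling their verbosity, assuming the imported logger is a standard logging.Logger (not verified in this commit):

import logging
from EduNLP.utils.log import logger
logger.setLevel(logging.INFO)       # show the load messages
# logger.setLevel(logging.WARNING)  # or silence them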
63 changes: 6 additions & 57 deletions EduNLP/ModelZoo/quesnet/quesnet.py
@@ -116,19 +116,13 @@ def make_batch(self, data, device, pretrain=False):
for q in data:
meta = torch.zeros(len(self.stoi[self.meta])).to(device)
meta[q.labels.get(self.meta) or []] = 1
_lembs = [
# self.we(torch.tensor([0], device=device)),
# self.we(torch.tensor([0], device=device)),
torch.zeros(1, self.emb_size).to(device),
torch.zeros(1, self.emb_size).to(device),
self.me.enc(meta.unsqueeze(0)) * self.lambda_input[2]]
_lembs = [torch.zeros(1, self.emb_size).to(device),
torch.zeros(1, self.emb_size).to(device),
self.me.enc(meta.unsqueeze(0)) * self.lambda_input[2]]
_rembs = [self.me.enc(meta.unsqueeze(0)) * self.lambda_input[2]]
_embs = [
# self.we(torch.tensor([0], device=device)),
# self.we(torch.tensor([0], device=device)),
torch.zeros(1, self.emb_size).to(device),
torch.zeros(1, self.emb_size).to(device),
self.me.enc(meta.unsqueeze(0)) * self.lambda_input[2]]
_embs = [torch.zeros(1, self.emb_size).to(device),
torch.zeros(1, self.emb_size).to(device),
self.me.enc(meta.unsqueeze(0)) * self.lambda_input[2]]
_gt = [torch.tensor([0], device=device), meta]
for w in q.content:
if isinstance(w, int):
@@ -148,10 +142,6 @@ def make_batch(self, data, device, pretrain=False):
_embs.append(item)
_gt.append(im)
_gt.append(torch.tensor([0], device=device))
# _rembs.append(self.we(torch.tensor([0], device=device)))
# _embs.append(self.we(torch.tensor([0], device=device)))
# _embs.append(self.we(torch.tensor([1], device=device)))
# _rembs.append(self.we(torch.tensor([1], device=device)))
_rembs.append(torch.zeros(1, self.emb_size).to(device))
_rembs.append(torch.zeros(1, self.emb_size).to(device))
_embs.append(torch.zeros(1, self.emb_size).to(device))
@@ -339,51 +329,10 @@ def forward(self, batch):
torch.zeros_like(self.ans_judge(y.data)))
loss = floss * self.lambda_loss[1]
# low-level loss
# left_hid = self.quesnet(left).pack_embeded.data[:, :self.rnn_size].clone()
# right_hid = self.quesnet(right).pack_embeded.data[:, self.rnn_size:].clone()
left_hid = self.quesnet(left).pack_embeded.data[:, :self.rnn_size]
right_hid = self.quesnet(right).pack_embeded.data[:, self.rnn_size:]

wloss = iloss = mloss = None

# if words is not None:
# lwfea = torch.masked_select(left_hid.clone(), wmask.unsqueeze(1).bool()) \
# .view(-1, self.rnn_size).clone()
# lout = self.lwoutput(lwfea.clone())
# rwfea = torch.masked_select(right_hid.clone(), wmask.unsqueeze(1).bool()) \
# .view(-1, self.rnn_size).clone()
# rout = self.rwoutput(rwfea.clone())
# out = self.woutput(torch.cat([lwfea.clone(), rwfea.clone()], dim=1).clone())
# wloss = (F.cross_entropy(out, words) + F.cross_entropy(lout, words) + F.
# cross_entropy(rout, words)) * self.quesnet.lambda_input[0] / 3
# wloss *= self.lambda_loss[0]
# loss = loss + wloss

# if ims is not None:
# lifea = torch.masked_select(left_hid.clone(), imask.unsqueeze(1).bool()) \
# .view(-1, self.rnn_size).clone()
# lout = self.lioutput(lifea.clone())
# rifea = torch.masked_select(right_hid.clone(), imask.unsqueeze(1).bool()) \
# .view(-1, self.rnn_size).clone()
# rout = self.rioutput(rifea.clone())
# out = self.ioutput(torch.cat([lifea.clone(), rifea.clone()], dim=1).clone())
# iloss = (self.quesnet.ie.loss(ims, out) + self.quesnet.ie.loss(ims, lout) + self.quesnet.ie.
# loss(ims, rout)) * self.quesnet.lambda_input[1] / 3
# iloss *= self.lambda_loss[0]
# loss = loss + iloss

# if metas is not None:
# lmfea = torch.masked_select(left_hid.clone(), mmask.unsqueeze(1).bool()) \
# .view(-1, self.rnn_size).clone()
# lout = self.lmoutput(lmfea.clone())
# rmfea = torch.masked_select(right_hid.clone(), mmask.unsqueeze(1).bool()) \
# .view(-1, self.rnn_size).clone()
# rout = self.rmoutput(rmfea.clone())
# out = self.moutput(torch.cat([lmfea.clone(), rmfea.clone()], dim=1).clone())
# mloss = (self.quesnet.me.loss(metas, out) + self.quesnet.me.loss(metas, lout) + self.quesnet.me.
# loss(metas, rout)) * self.quesnet.lambda_input[2] / 3
# mloss *= self.lambda_loss[0]
# loss = loss + mloss
if words is not None:
lwfea = torch.masked_select(left_hid, wmask.unsqueeze(1).bool()) \
.view(-1, self.rnn_size)
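
The removed blocks above were commented-out duplicates of the word/image/meta loss branches that follow; the surviving code keeps the same masked_select pattern, now without the defensive .clone() calls. A standalone sketch of that selection step with toy tensors (shapes are illustrative, not the model's real sizes):

import torch
rnn_size = 4
left_hid = torch.randn(6, rnn_size)        # per-position hidden states
wmask = torch.tensor([1, 0, 1, 1, 0, 0])   # 1 marks word-token positions
lwfea = torch.masked_select(left_hid, wmask.unsqueeze(1).bool()).view(-1, rnn_size)
print(lwfea.shape)  # torch.Size([3, 4]): one row per selected position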
4 changes: 2 additions & 2 deletions EduNLP/Pretrain/__init__.py
@@ -1,10 +1,10 @@
# coding: utf-8
# 2021/5/29 @ tongshiwei

from .gensim_vec import train_vector, GensimWordTokenizer, GensimSegTokenizer
from .gensim_vec import pretrain_vector, GensimWordTokenizer, GensimSegTokenizer
from .elmo_vec import *
from .auto_vec import *
from .bert_vec import *
from .hf_model_vec import *
from .quesnet_vec import QuesNetTokenizer, pretrain_quesnet, Question
from .disenqnet_vec import *
from .pretrian_utils import *
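
Note the gensim training entry point is now re-exported as pretrain_vector (formerly train_vector), so downstream imports must follow the rename, e.g.:

# old: from EduNLP.Pretrain import train_vector
from EduNLP.Pretrain import pretrain_vector, GensimWordTokenizer  # new name, same module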
126 changes: 78 additions & 48 deletions EduNLP/Pretrain/hf_model_vec.py → EduNLP/Pretrain/auto_vec.py
@@ -5,13 +5,20 @@
from transformers import Trainer, TrainingArguments
from copy import deepcopy

from ..ModelZoo.hf_model import HfModelForPropertyPrediction, HfModelForKnowledgePrediction
from ..ModelZoo.hf_model import (
HfModelForPropertyPrediction,
HfModelForKnowledgePrediction,
)
from .pretrian_utils import EduDataset
from .hugginface_utils import TokenizerForHuggingface

__all__ = ["HfAutoTokenizer", "HfAutoDataset", "finetune_hf_auto_model",
"finetune_hf_auto_model_for_property_prediction",
"finetune_hf_auto_model_for_knowledge_prediction"]
__all__ = [
"EduAutoTokenizer",
"EduAutoDataset",
"finetune_edu_auto_model",
"finetune_edu_auto_model_for_property_prediction",
"finetune_edu_auto_model_for_knowledge_prediction",
]

DEFAULT_TRAIN_PARAMS = {
# default
@@ -35,11 +42,11 @@
}


class HfAutoTokenizer(TokenizerForHuggingface):
class EduAutoTokenizer(TokenizerForHuggingface):
"""
Examples
----------
>>> tokenizer = HfAutoTokenizer(add_special_tokens=True)
>>> tokenizer = EduAutoTokenizer(add_special_tokens=True)
>>> item = "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$,\
... 若$x,y$满足约束条件公式$\\FormFigureBase64{wrong2?}$,$\\SIFSep$,则$z=x+7 y$的最大值为$\\SIFBlank$"
>>> token_item = tokenizer(item)
@@ -56,17 +63,25 @@ class HfAutoTokenizer(TokenizerForHuggingface):
>>> print(len(tokenizer.tokenize(items)))
2
>>> tokenizer.save_pretrained('test_dir') # doctest: +SKIP
>>> tokenizer = HfAutoTokenizer.from_pretrained('test_dir') # doctest: +SKIP
>>> tokenizer = EduAutoTokenizer.from_pretrained('test_dir') # doctest: +SKIP
"""

pass


class HfAutoDataset(EduDataset):
class EduAutoDataset(EduDataset):
pass


def finetune_hf_auto_model(items: Union[List[dict], List[str]], output_dir: str, pretrained_model="bert-base-chinese",
tokenizer_params=None, data_params=None, model_params=None, train_params=None):
def finetune_edu_auto_model(
items: Union[List[dict], List[str]],
output_dir: str,
pretrained_model="bert-base-chinese",
tokenizer_params=None,
data_params=None,
model_params=None,
train_params=None,
):
"""
Parameters
----------
@@ -90,7 +105,7 @@ def finetune_hf_auto_model(items: Union[List[dict], List[str]], output_dir: str,
----------
>>> stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
... "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
>>> finetune_hf_auto_model(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
>>> finetune_edu_auto_model(stems, "examples/test_model/data/data/bert") # doctest: +SKIP
{'train_runtime': ..., ..., 'epoch': 1.0}
"""
tokenizer_params = tokenizer_params if tokenizer_params else {}
@@ -99,24 +114,25 @@ def finetune_hf_auto_model(items: Union[List[dict], List[str]], output_dir: str,
train_params = train_params if train_params is not None else {}
# tokenizer configuration
if os.path.exists(pretrained_model):
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = EduAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
else:
work_tokenizer_params = {
"add_specials": True,
"tokenize_method": "pure_text",
}
work_tokenizer_params.update(tokenizer_params)
tokenizer = HfAutoTokenizer(pretrained_model, **work_tokenizer_params)
tokenizer = EduAutoTokenizer(pretrained_model, **work_tokenizer_params)
# TODO: tokenizer.set_vocab()
# model configuration
model = AutoModelForMaskedLM.from_pretrained(pretrained_model, **model_params)
# resize embedding for additional special tokens
model.resize_token_embeddings(len(tokenizer.bert_tokenizer))

# dataset configuration
dataset = HfAutoDataset(tokenizer, items=items,
stem_key=data_params.get("stem_key", None))
mlm_probability = train_params.pop('mlm_probability', 0.15)
dataset = EduAutoDataset(
tokenizer, items=items, stem_key=data_params.get("stem_key", None)
)
mlm_probability = train_params.pop("mlm_probability", 0.15)
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer.bert_tokenizer, mlm=True, mlm_probability=mlm_probability
)
@@ -137,15 +153,16 @@ def finetune_hf_auto_model(items: Union[List[dict], List[str]], output_dir: str,
tokenizer.save_pretrained(output_dir)


def finetune_hf_auto_model_for_property_prediction(train_items,
output_dir,
pretrained_model="bert-base-chinese",
eval_items=None,
tokenizer_params=None,
data_params=None,
train_params=None,
model_params=None
):
def finetune_edu_auto_model_for_property_prediction(
train_items,
output_dir,
pretrained_model="bert-base-chinese",
eval_items=None,
tokenizer_params=None,
data_params=None,
train_params=None,
model_params=None,
):
"""
Parameters
----------
@@ -170,15 +187,21 @@ def finetune_hf_auto_model_for_property_prediction(train_items,
model_params = model_params if model_params is not None else {}
train_params = train_params if train_params is not None else {}
# tokenizer configuration
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = EduAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
# dataset configuration
train_dataset = HfAutoDataset(tokenizer=tokenizer, items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "difficulty"))
train_dataset = EduAutoDataset(
tokenizer=tokenizer,
items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "difficulty"),
)
if eval_items is not None:
eval_dataset = HfAutoDataset(tokenizer=tokenizer, items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "difficulty"))
eval_dataset = EduAutoDataset(
tokenizer=tokenizer,
items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "difficulty"),
)
else:
eval_dataset = None
# model configuration
@@ -205,15 +228,16 @@ def finetune_hf_auto_model_for_property_prediction(train_items,
tokenizer.save_pretrained(output_dir)


def finetune_hf_auto_model_for_knowledge_prediction(train_items,
output_dir,
pretrained_model="bert-base-chinese",
eval_items=None,
tokenizer_params=None,
data_params=None,
train_params=None,
model_params=None
):
def finetune_edu_auto_model_for_knowledge_prediction(
train_items,
output_dir,
pretrained_model="bert-base-chinese",
eval_items=None,
tokenizer_params=None,
data_params=None,
train_params=None,
model_params=None,
):
"""
Parameters
----------
@@ -238,15 +262,21 @@ def finetune_hf_auto_model_for_knowledge_prediction(train_items,
model_params = model_params if model_params is not None else {}
train_params = train_params if train_params is not None else {}
# tokenizer configuration
tokenizer = HfAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
tokenizer = EduAutoTokenizer.from_pretrained(pretrained_model, **tokenizer_params)
# dataset configuration
train_dataset = HfAutoDataset(tokenizer=tokenizer, items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "know_list"))
train_dataset = EduAutoDataset(
tokenizer=tokenizer,
items=train_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "know_list"),
)
if eval_items is not None:
eval_dataset = HfAutoDataset(tokenizer=tokenizer, items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "know_list"))
eval_dataset = EduAutoDataset(
tokenizer=tokenizer,
items=eval_items,
stem_key=data_params.get("stem_key", "ques_content"),
label_key=data_params.get("label_key", "know_list"),
)
else:
eval_dataset = None
# model configuration
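
With the renames above, the fine-tuning helpers are exposed under the Edu* names. A hedged usage sketch mirroring the doctest in this file (the output path is illustrative, and bert-base-chinese is downloaded if not cached):

from EduNLP.Pretrain import EduAutoTokenizer, finetune_edu_auto_model
stems = ["有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$",
         "有公式$\\FormFigureID{wrong1?}$,如图$\\FigureID{088f15ea-xxx}$"]
finetune_edu_auto_model(stems, "examples/test_model/data/data/bert")
tokenizer = EduAutoTokenizer.from_pretrained("examples/test_model/data/data/bert")  # reload what was saved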