name_embedding.py
import re

import numpy as np
import torch
from transformers import BertConfig, BertModel, BertTokenizer

# Local directory holding the pretrained multilingual BERT weights, config and vocab.
pretrained_path = 'bert-multi/'
config = BertConfig.from_pretrained(pretrained_path)
tokenizer = BertTokenizer.from_pretrained(pretrained_path)
model = BertModel.from_pretrained(pretrained_path, config=config)
def gen_mean(vals, p):
    """Generalised (power) mean of a list of vectors; p = 1 is the arithmetic mean."""
    p = float(p)
    return np.power(
        np.mean(
            np.power(
                np.array(vals, dtype=complex),
                p),
            axis=0),
        1 / p
    )


# Pooling operations: name -> (pooling function, output-size function).
# Only the arithmetic mean over token embeddings is used below.
operations = {
    'mean': (lambda word_embeddings: [np.mean(word_embeddings, axis=0)],
             lambda embeddings_size: embeddings_size),
}
def get_sentence_embedding(embeddings, chosen_operations, con):
    """Pool BERT token embeddings into one sentence vector, padded with 132 zeros."""
    word_embeddings = embeddings[0, :, :].detach().numpy()
    for o in chosen_operations:
        concat_embs = operations[o][0](word_embeddings)
        sentence_embedding = np.concatenate([concat_embs[0], [0] * 132])
    return sentence_embedding, con


def softmax(x, axis=None):
    """Numerically stable softmax (shift by the max before exponentiating)."""
    x = x - x.max(axis=axis, keepdims=True)
    y = np.exp(x)
    return y / y.sum(axis=axis, keepdims=True)
id2embed1 = dict()
id2embed2 = dict()
id2name = dict()
path = './data/DBP15K/dbp_wd/'

# Embed the entity names of the first knowledge graph (ent_ids_1: "<id>\t<uri>" per line).
inf = open(path + 'ent_ids_1')
con = 0
for i1, line in enumerate(inf):
    strs = line.strip().split('\t')
    id2name[int(strs[0])] = strs[1]
    # Take the last URI segment as the entity name, lowercased, with underscores as spaces.
    wordline = strs[1].split('/')[-1].lower().replace('_', ' ')
    words = re.findall(r'\b\w+\b', wordline)
    # Concatenate the words into a single string (original behaviour: no spaces kept).
    words_new = ''.join(words)
    batch = tokenizer.encode_plus(words_new)
    input_ids = torch.tensor([batch['input_ids']])
    token_type_ids = torch.tensor([batch['token_type_ids']])
    attention_mask = torch.tensor([batch['attention_mask']])
    embedding = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    embed, con = get_sentence_embedding(embedding[0], ['mean'], con)
    id2embed1[strs[0]] = embed
inf.close()
# Embed the entity names of the second knowledge graph (ent_ids_2), same procedure.
con1 = 0
inf = open(path + 'ent_ids_2')
for i2, line in enumerate(inf):
    strs = line.strip().split('\t')
    id2name[int(strs[0])] = strs[1]
    wordline = strs[1].split('/')[-1].lower().replace('_', ' ')
    words = re.findall(r'\b\w+\b', wordline)
    words_new = ''.join(words)
    batch = tokenizer.encode_plus(words_new)
    input_ids = torch.tensor([batch['input_ids']])
    token_type_ids = torch.tensor([batch['token_type_ids']])
    attention_mask = torch.tensor([batch['attention_mask']])
    embedding = model(input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    embed, con1 = get_sentence_embedding(embedding[0], ['mean'], con1)
    id2embed2[strs[0]] = embed
inf.close()
# Write one space-separated embedding per line, in the same order as ent_ids_1.
outf = open(path + 'name_vec1.txt', 'w')
for ent_id in id2embed1:
    embed = id2embed1[ent_id]
    dis_str = ' '.join(str(v) for v in embed)
    outf.write(dis_str + '\n')
outf.close()
# Same for the second graph, written to name_vec2.txt.
outf = open(path + 'name_vec2.txt', 'w')
for ent_id in id2embed2:
    embed = id2embed2[ent_id]
    dis_str = ' '.join(str(v) for v in embed)
    outf.write(dis_str + '\n')
outf.close()
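
A minimal sketch of how the emitted files could be read back, assuming each line of name_vec1.txt / name_vec2.txt is one space-separated vector in the same order as the corresponding ent_ids file, and assuming the usual BERT-base hidden size of 768 (plus the 132-zero padding added above, i.e. 900 values per line):

import numpy as np

# Hypothetical loader: not part of the script above, only illustrates the file format it produces.
name_vecs = np.loadtxt('./data/DBP15K/dbp_wd/name_vec1.txt')
print(name_vecs.shape)  # expected: (number of entities in ent_ids_1, 768 + 132)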