From 48d7526197d9f7f65a8110b56ef3b2c8eeeaef95 Mon Sep 17 00:00:00 2001 From: Caglar Demir Date: Wed, 16 Feb 2022 14:34:31 +0100 Subject: [PATCH] Major update: KGE & unused paramters are removed. --- setup.py | 3 +- tests/test_QCUT.py | 17 +++------ vectograph/helper_classes.py | 71 ------------------------------------ vectograph/helper_funcs.py | 4 -- vectograph/kge_models.py | 33 ----------------- vectograph/transformers.py | 15 +------- 6 files changed, 7 insertions(+), 136 deletions(-) delete mode 100644 vectograph/helper_classes.py delete mode 100644 vectograph/helper_funcs.py delete mode 100644 vectograph/kge_models.py diff --git a/setup.py b/setup.py index 1c76bad..ca7cbfe 100644 --- a/setup.py +++ b/setup.py @@ -9,8 +9,7 @@ install_requires=['scikit-learn>=0.22.1', 'pytest', 'rdflib', - 'pandas>=1.0.3', - 'torch'], + 'pandas>=1.0.3'], author='Caglar Demir', author_email='caglardemir8@gmail.com', classifiers=[ diff --git a/tests/test_QCUT.py b/tests/test_QCUT.py index 4d53a88..2a49b34 100644 --- a/tests/test_QCUT.py +++ b/tests/test_QCUT.py @@ -6,19 +6,12 @@ class TestDefault: def test_default_QCUT(self): + X, y = datasets.fetch_california_housing(return_X_y=True) - n, m = X.shape - X_transformed = QCUT(min_unique_val_per_column=2, num_quantile=5).transform(pd.DataFrame(X)) + X_transformed = QCUT(min_unique_val_per_column=6, num_quantile=5).transform(pd.DataFrame(X)) # Add prefix X_transformed.index = 'Event_' + X_transformed.index.astype(str) + kg = GraphGenerator().transform(X_transformed) - gg = GraphGenerator() - kg = gg.transform(X_transformed) - assert len(kg) == (n * m) - new_kg = [] - with open(gg.path, 'r') as read: - for i in read: - s, p, o, = i.split() - new_kg.append((s, p, o)) - - assert kg == new_kg + for s, p, o in kg: + pass #print(s, p, o) diff --git a/vectograph/helper_classes.py b/vectograph/helper_classes.py deleted file mode 100644 index dc35293..0000000 --- a/vectograph/helper_classes.py +++ /dev/null @@ -1,71 +0,0 @@ -from collections import defaultdict -import numpy as np -import torch - - -class Data: - - def __init__(self, data_path: str): - self.triples = self.parse_data(data_path) - self.cuda = False - self.entities = self.get_entities(self.triples) - self.tails = self.get_tails(self.triples) - self.relations = self.get_relations(self.triples) - - self.entity_idxs = {self.entities[i]: i for i in range(len(self.entities))} - self.relation_idxs = {self.relations[i]: i for i in range(len(self.relations))} - self.train_data_idxs = self.get_data_idxs(self.triples) - - def get_data_idxs(self, data): - data_idxs = [(self.entity_idxs[data[i][0]], self.relation_idxs[data[i][1]], self.entity_idxs[data[i][2]]) for i - in range(len(data))] - return data_idxs - - @staticmethod - def parse_data(data_path): - import re - data = [] - with open(data_path, "r") as f: - for triple in f: - if '"' in triple or "'" in triple or len(triple)==1: - continue - components = re.findall('<(.+?)>', triple) - try: - assert len(components) == 3 - except AssertionError: - print(triple) - print(len(triple)) - exit(1) - data.append(components) - return data - - @staticmethod - def get_relations(data): - relations = sorted(list(set([d[1] for d in data]))) - return relations - - @staticmethod - def get_entities(data): - entities = sorted(list(set([d[0] for d in data] + [d[2] for d in data]))) - return entities - - @staticmethod - def get_tails(data): - tails = sorted(list(set([d[2] for d in data]))) - return tails - - def get_er_vocab(self, data): - er_vocab = defaultdict(list) - for triple in data: - er_vocab[(triple[0], triple[1])].append(triple[2]) - return er_vocab - - def get_batch(self, er_vocab, er_vocab_pairs, idx, batch_size): - batch = er_vocab_pairs[idx:idx + batch_size] - targets = np.zeros((len(batch), len(self.entities))) - for idx, pair in enumerate(batch): - targets[idx, er_vocab[pair]] = 1. - targets = torch.FloatTensor(targets) - if self.cuda: - targets = targets.cuda() - return np.array(batch), targets diff --git a/vectograph/helper_funcs.py b/vectograph/helper_funcs.py deleted file mode 100644 index 664605e..0000000 --- a/vectograph/helper_funcs.py +++ /dev/null @@ -1,4 +0,0 @@ -import os -def apply_PYKE(t): - g, path ,params= t - os.system("python PYKE/execute.py --kg_path {0} --embedding_dim {1} --eval True".format(path,params['embedding_dim'])) diff --git a/vectograph/kge_models.py b/vectograph/kge_models.py deleted file mode 100644 index 56b16bd..0000000 --- a/vectograph/kge_models.py +++ /dev/null @@ -1,33 +0,0 @@ -import torch -from torch.nn.init import xavier_normal_ - - -class Distmult(torch.nn.Module): - def __init__(self, params): - super(Distmult, self).__init__() - self.name = 'Distmult' - self.emb_e = torch.nn.Embedding(params['num_entities'], params['embedding_dim'], padding_idx=0) - self.emb_rel = torch.nn.Embedding(params['num_relations'], params['embedding_dim'], padding_idx=0) - self.inp_drop = torch.nn.Dropout(params['input_dropout']) - self.loss = torch.nn.BCELoss() - - self.bn0 = torch.nn.BatchNorm1d(params['embedding_dim']) - self.bn1 = torch.nn.BatchNorm1d(params['embedding_dim']) - - def init(self): - xavier_normal_(self.emb_e.weight.data) - xavier_normal_(self.emb_rel.weight.data) - - def forward(self, e1, rel): - e1_embedded = self.emb_e(e1) - rel_embedded = self.emb_rel(rel) - e1_embedded = e1_embedded.squeeze() - rel_embedded = rel_embedded.squeeze() - - e1_embedded = self.bn0(self.inp_drop(e1_embedded)) - rel_embedded = self.bn1(self.inp_drop(rel_embedded)) - - pred = torch.mm(e1_embedded * rel_embedded, self.emb_e.weight.transpose(1, 0)) - pred = torch.sigmoid(pred) - - return pred \ No newline at end of file diff --git a/vectograph/transformers.py b/vectograph/transformers.py index 6c28065..0399a5c 100644 --- a/vectograph/transformers.py +++ b/vectograph/transformers.py @@ -1,20 +1,7 @@ from sklearn.base import BaseEstimator, TransformerMixin from rdflib import Graph, URIRef, Namespace # basic RDF handling -from .kge_models import * -from collections import Counter, defaultdict -from .helper_classes import Data -import torch -import numpy as np import pandas as pd -from typing import Dict, Tuple, List -import math -import random -from scipy.spatial.distance import cosine -from sklearn.neighbors import NearestNeighbors -import os -import itertools -from .utils import ignore_columns, create_experiment_folder, create_logger - +from typing import List class RDFGraphCreator(BaseEstimator, TransformerMixin): """