forked from JY3150/social-media-influence-analysis
main.py · 275 lines (236 loc) · 10 KB
import math
from DAO.DAOFactory import DAOFactory
from UserPartitioning import UserPartitioningStrategyFactory
from Mapping.MappingFactory import MappingFactory
from Builder.ContentMarketBuilder import ContentMarketBuilder
from Builder.ContentSpaceBuilder import ContentSpaceBuilder
from Builder.ContentDemandSupplyBuilder import ContentDemandSupplyBuilder
from TS.SimpleTimeSeriesBuilder import SimpleTimeSeriesBuilder
from TS.MATimeSeriesBuilder import MATimeSeriesBuilder
from TS.FractionTimeSeriesBuilder import FractionTimeSeriesBuilder
from TS.SupplyCentricMATimeSeriesBuilder import SupplyCentricMATimeSeriesBuilder
from TS.SupplyCentricTimeSeriesBuilder import SupplyCentricTimeSeriesBuilder
from TS.SupplyAdvanceTimeSeriesBuilder import SupplyAdvanceTimeSeriesBuilder
from TS.FractionTimeSeriesConverter import FractionTimeSeriesConverter
from Causality.CausalityAnalysisTool import *
from Causality.BinningCausalityAnalysis import BinningCausalityAnalysis
from Causality.RankCausalityAnalysis import RankCausalityAnalysis
from User.UserType import UserType
from Visualization.KmersPlotter import KmersPlotter
from Visualization.CreatorPlotter import CreatorPlotter
from Visualization.BinningPlotter import BinningPlotter
import os
import json
import pickle
from datetime import datetime, timedelta
import numpy as np
import matplotlib.pyplot as plt
# import nltk
# nltk.download('averaged_perceptron_tagger')
#%% ###################### Parameter Setup ######################
# Load configuration
config_file_path = "../config.json"
with open(config_file_path) as config_file:
    config = json.load(config_file)
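# Keys this script reads from config.json (structure inferred from the accesses
# below; the actual file may contain more fields):
#   "partitioning_strategy", "content_type_method", "num_clusters", "num_bins",
#   "database": {"content_market_db_name", "content_space_db_name",
#                "content_demand_supply_db_name", ...}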
# Load from database
MARKET_LOAD = False
SPACE_LOAD = False
DEMAND_SUPPLY_LOAD = False
# Store to database
MARKET_STORE = False
SPACE_STORE = False
DEMAND_SUPPLY_STORE = False
# Skip Building
MARKET_SKIP = False
SPACE_SKIP = False
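# When a *_LOAD flag is True the corresponding builder calls load() instead of
# create(); a *_STORE flag writes the built object back via store(); a *_SKIP
# flag bypasses that build block entirely.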
#%% ################### Build DAO Factory and Partitioning ###################
dao_factory = DAOFactory()
partition = UserPartitioningStrategyFactory.get_user_type_strategy(
config["partitioning_strategy"])
#%% ######################### Build Content Market #########################
if not MARKET_SKIP:
    market_dao = dao_factory.get_content_market_dao(config["database"])
    market_builder = ContentMarketBuilder(
        config["database"]["content_market_db_name"],
        market_dao, partition)
    if MARKET_LOAD:
        market = market_builder.load()
    else:
        market = market_builder.create()
    if MARKET_STORE:
        market_builder.store(market)

# core_node_id = 18104734
# market.preserve_core_node(core_node_id)
#%% ######################### Build Content Space #########################
if not SPACE_SKIP:
    # Build ContentType Mapping
    space_dao = dao_factory.get_content_space_dao(config["database"])
    mapping = None
    if not SPACE_LOAD:
        # Note: this branch uses market_dao and market from the Content Market
        # step above, so MARKET_SKIP must be False when creating a new mapping.
        print("=================Creating Mapping=================")
        mapping_factory = MappingFactory(config["content_type_method"])
        mapping = mapping_factory.get_cluster({
            "num_clusters": config["num_clusters"],
            "embeddings": market_dao.load_tweet_embeddings(),
            # "words": ["chess"],
            "num_bins": config["num_bins"],
            "market": market
        })
        mapping.generate_tweet_to_type()
        pickle.dump(mapping, open("binning_mapping.pkl", "wb"))
        # mapping = pickle.load(open("binning_mapping.pkl", "rb"))
        # print("=================Loading Mapping=================")

    # Build Content Space
    if SPACE_LOAD:
        space_builder = ContentSpaceBuilder(
            config["database"]["content_space_db_name"],
            space_dao, partition)
        space = space_builder.load()
    else:
        space_builder = ContentSpaceBuilder(
            config["database"]["content_space_db_name"],
            space_dao, partition, market, mapping)
        space = space_builder.create()
    if SPACE_STORE:
        space_builder.store(space)
#%% ######################### Build Demand and Supply #########################
ds_dao = dao_factory.get_supply_demand_dao(config["database"])
if DEMAND_SUPPLY_LOAD:
    ds_builder = ContentDemandSupplyBuilder(
        config["database"]["content_demand_supply_db_name"], ds_dao)
    ds = ds_builder.load()
else:
    ds_builder = ContentDemandSupplyBuilder(
        config["database"]["content_demand_supply_db_name"], ds_dao, space)
    ds = ds_builder.create()

if DEMAND_SUPPLY_STORE:
    ds_builder.store(ds)
#%% #################### Plotting ####################
# plotter = BinningPlotter(ds)
# plotter.create_mapping_curves(save=True)
#%% ######################### Build Time Series #########################
start = datetime(2020, 9, 1)
end = datetime(2023, 2, 1)
period = timedelta(days=2)
# ts_builder = SupplyCentricTimeSeriesBuilder(ds, space, start, end, period, 0)
ts_builder = SimpleTimeSeriesBuilder(ds, space, start, end, period)
# ts_builder = MATimeSeriesBuilder(ds, space, start, end, period, timedelta(days=3))
# ts_builder = SupplyCentricMATimeSeriesBuilder(ds, space, start, end, period, timedelta(days=3), 0)
# ts_builder = FractionTimeSeriesConverter(ts_builder)
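# SimpleTimeSeriesBuilder presumably aggregates demand/supply per 2-day period
# between 2020-09-01 and 2023-02-01; the commented-out builders above are the
# moving-average, supply-centric, and fraction-based variants imported at the top.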
#%% Agg Supply and Demand analysis
for combi in [[6, 9], [11, 13], [9, 13], [13, 15], [13, 17], [15, 17]]:
    ols_for_bins(ts_builder, combi, 4)
# exit(0) stops the script here; comment it out to run the analysis below.
exit(0)
consumer_demand = ts_builder.create_all_type_time_series(UserType.CONSUMER, "demand_in_community")
core_node_demand = ts_builder.create_all_type_time_series(UserType.CORE_NODE, "demand_in_community")
core_node_supply = ts_builder.create_all_type_time_series(UserType.CORE_NODE, "supply")
producer_supply = ts_builder.create_all_type_time_series(UserType.PRODUCER, "supply")
demand = np.add(consumer_demand, core_node_demand)
supply = np.add(producer_supply, core_node_supply)
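# Total demand = consumer demand + core-node demand; total supply = producer
# supply + core-node supply (element-wise over the time series).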
log_demand = [math.log(num) if num > 0 else 0 for num in demand]  # not used below
#
# demand = ts_builder.create_agg_time_series(11, 'demand_in_community')
# supply = ts_builder.create_agg_time_series(11, "supply")
# 75/25 train/test split: demand is the predictor (x), supply is the target (y)
n = int(3 / 4 * len(consumer_demand))
tsample, ttarget = demand[:n], supply[:n]
tstsample, tsttarget = demand[n:], supply[n:]
import pandas as pd
import statsmodels.api as sm
data = pd.DataFrame({
    'x': tsample,
    'y': ttarget
})
lag = 3
# Create lagged variables
for i in range(1, lag+1):
    data[f'x_lag_{i}'] = data['x'].shift(i)
# Drop rows with NaN values introduced by the lag
data = data.dropna()
# Define the independent variables (note: no constant term is added)
X = data[[f'x_lag_{i}' for i in range(1, lag+1)]]
# Define the dependent variable
y = data['y']
# Fit the OLS regression model with lagged variables
model = sm.OLS(y, X).fit()
# Get the regression results
results = model.summary()
# Print the results
print(results)
coef = [model.params[f'x_lag_{i}'] for i in range(1, lag + 1)]
# Out-of-sample prediction on the held-out series. Note that the model was fit on
# x_lag_1..x_lag_3 (x shifted back by 1..3 steps), whereas this prediction applies
# the coefficients to x at offsets 0..2, i.e. shifted by one step relative to the fit.
prediction = [coef[0] * tstsample[i] + coef[1] * tstsample[i - 1] + coef[2] * tstsample[i - 2]
              for i in range(2, len(tstsample))]
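# Alternative sketch (commented out, so the computation above is what actually
# runs): apply the fitted model to the held-out series with the same lag
# structure it was estimated on, via statsmodels' predict(). `test_data` and
# `prediction_sm` are hypothetical names introduced here for illustration.
# test_data = pd.DataFrame({'x': tstsample})
# for i in range(1, lag + 1):
#     test_data[f'x_lag_{i}'] = test_data['x'].shift(i)
# test_data = test_data.dropna()
# prediction_sm = model.predict(test_data[[f'x_lag_{i}' for i in range(1, lag + 1)]])
# # prediction_sm would then align with tsttarget[lag:] rather than tsttarget[2:].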
plt.figure()
plt.plot(ts_builder.get_time_stamps()[n + 2:], prediction, label="prediction")
plt.plot(ts_builder.get_time_stamps()[n + 2:], tsttarget[2:], label="target")
plt.legend()
plt.gcf().autofmt_xdate()
title = "prediction and target ds, period=1"
plt.title(title)
plt.savefig("../results/" + title)
# In-sample (training) prediction; this overwrites `prediction` above and is what
# the Granger causality section below uses.
prediction = [coef[0] * tsample[i] + coef[1] * tsample[i - 1] +
              coef[2] * tsample[i - 2] for i in range(2, len(tsample))]
plt.figure()
plt.plot(ts_builder.get_time_stamps()[2:len(tsample)], prediction, label="prediction")
plt.plot(ts_builder.get_time_stamps()[2:len(tsample)], ttarget[2:], label="target")
plt.legend()
plt.gcf().autofmt_xdate()
title = "prediction and target ds train, period=1"
plt.title(title)
plt.savefig("../results/" + title)
# plot time series
plt.figure()
plt.plot(ts_builder.get_time_stamps(), demand, label="demand")
plt.plot(ts_builder.get_time_stamps(), supply, label="supply")
plt.legend()
plt.gcf().autofmt_xdate()
title = "Aggregate Supply and Demand MAF"
plt.title(title)
plt.savefig("../results/" + title)
# Granger Causality
lags = list(range(-10, 11))
plt.figure()
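# gc_score_for_lags comes from Causality.CausalityAnalysisTool (star import above);
# judging by the axis labels and the 0.05 reference line below, it appears to
# return one Granger-causality p-value per lag.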
gc = gc_score_for_lags(prediction, ttarget[2:], lags)
plt.plot(lags, gc)
plt.xticks(lags)
plt.xlabel("lags")
plt.ylabel("p-value")
plt.axhline(y=0.05, color="red", linestyle="--")
title = "granger causality for prediction and target"
plt.title(title)
plt.savefig("../results/" + title)
# Results for structural properties
from analysis import original_tweets_to_retweets_ratio, plot_social_support_rank_and_value, \
    plot_social_support_and_number_of_followers, plot_social_support_and_number_of_followings, \
    plot_rank_binned_followings, plot_bhattacharyya_distances, \
    plot_social_support_rank_and_retweets, relative_frequency_months
original_tweets_to_retweets_ratio(market)
plot_social_support_rank_and_value(space, [False, False, False, True])
plot_social_support_and_number_of_followers(space)
plot_social_support_and_number_of_followings(space)
plot_rank_binned_followings(space, bin_size=20, log=False)
plot_social_support_rank_and_value(space, [True, True, False, False])
print(len(space.retweets_of_in_comm))
print(len(space.retweets_of_out_comm_by_in_comm))
plot_social_support_rank_and_retweets(space)
# Bhattacharyya
plot_bhattacharyya_distances(space, ds)
# NMF topic modelling
from nmf import preprocess, find_good_num_topics, create_nmf_model, plot_top_words
tweets = sorted(list(market.original_tweets.union(market.retweets_of_in_comm)),
                key=lambda tweet: tweet.id)
print("Length of tweets: " + str(len(tweets)))
# Preprocess
print("Creating corpus..." + str(datetime.now()))
texts, corpus = preprocess(tweets)
print("Done creating corpus... " + str(datetime.now()))
# Find the best number of topics to use (this takes a while)
# There is also an error with this function: see https://stackoverflow.com/questions/63763610/get-coherence-function-error-in-python-while-using-lda
# find_good_num_topics(texts)
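# create_nmf_model presumably fits a 20-topic NMF on a TF-IDF representation of
# the corpus; the returned vectorizer exposes get_feature_names_out(), used below.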
nmf, vectorizer, tweet_id_to_topic = create_nmf_model(tweets, corpus, 20)
# Plot top words
tfidf_feature_names = vectorizer.get_feature_names_out()
plot_top_words(
    nmf, tfidf_feature_names, n_top_words=10, title="Topics in NMF model (Frobenius norm)"
)
# Relative Frequencies
relative_frequency_months(market)