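"""Train and visualize an LDA model of news-article bias.

Pipeline: fetch articles per news event from Event Registry, clean and filter
them, vectorize the text, fit an LDA topic model (grouped by article or by
news source), and plot a t-SNE embedding of the result via Plotly.

Example usage (flags are defined in the argparse parser at the bottom of this file):

    python main.py --n-events 5 --lda-groupby source
"""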
import ast
import os.path as op
import pickle as pkl
import re

import numpy as np
import pandas as pd

from nlp_demo import do_lda, do_lemmatize, do_vectorize
from plotting import tsne_plotly
from registry_data import fetch_event_articles

def get_source_prop(source, prop_name):
    """Parse a stringified source dict and return one of its properties.

    Falls back to returning the raw value when it cannot be parsed
    (e.g. the source field is missing or already not a string).
    """
    try:
        return ast.literal_eval(source).get(prop_name)
    except (ValueError, SyntaxError, TypeError):
        return source
def clean_text(txt):
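    """Strip non-alphabetic, non-whitespace characters and collapse runs of whitespace."""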
    txt = re.sub(r'[^\sa-zA-Z]', '', txt)
    txt = re.sub(r'\s+', ' ', txt)
    return txt
def reduce_by_source(df, thresh=0.75, max_rows_per_article=np.inf):
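    """Keep only articles from news sources that cover more than `thresh` of all events.

    When `max_rows_per_article` is finite, also cap the number of articles kept per
    event and source, preferring the longest bodies.
    """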
    df['source_title'] = df['source'].map(lambda s: get_source_prop(s, 'title'))
    df['source_uri'] = df['source'].map(lambda s: get_source_prop(s, 'uri') or '')
    n_events = len(df['eventUri'].unique())
    total_srcs = df['source_title']
    srcs_dict = {src: 0 for src in total_srcs}

    # Count, for each news source, how many events it covers.
    for _, event_group in df.groupby(by='eventUri'):
        event_srcs = event_group['source_title'].unique()
        for src in event_srcs:
            srcs_dict[src] += 1

    # Keep sources with enough event coverage (more than thresh of events); drop null/empty keys.
    srcs_with_cover = {key: value for key, value in srcs_dict.items()
                       if (value > n_events * thresh
                           and key and not pd.isnull(key))}
    print('total sources: {}\nsources with coverage (>{}): {}'.format(
        len(np.unique(total_srcs)), thresh, len(srcs_with_cover)))

    # Only include articles from covering sources, optionally capped per event and source.
    cover_articles = {}
    dfs = []
    df['body_len'] = df['body'].map(lambda b: len(b) if not pd.isnull(b) else 0)
    df = df.sort_values(by=['body_len'], ascending=False)
    for event_uri, event_df in df.groupby(by='eventUri'):
        cover_articles[event_uri] = {}
        for src in srcs_with_cover.keys():
            src_rows = event_df.loc[event_df['source_title'] == src, :]  # .sort_values(by='body_len', ascending=False)
            if len(src_rows) >= 1 and max_rows_per_article < np.inf:
                max_idx = int(np.min([
                    np.ceil(len(src_rows) / 2.),
                    np.min([max_rows_per_article, len(src_rows)])]))
                assert max_idx > 0, "Never should reduce from an article to nothing."
                # print("Reduce %s from %d to %d" % (src, len(src_rows), max_idx))
                src_rows = src_rows.iloc[:max_idx]
            dfs.append(src_rows)

    article_df = pd.concat(dfs)
    return article_df
def load_and_clean(csv_file, n_events=2, min_article_length=250, source_thresh=0.75, force=False,
eventregistry_api_key=None, min_articles=500, group_by='source'):
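    """Fetch, clean, and filter event articles, caching the result to a CSV.

    Articles are cleaned of non-alphabetic characters, filtered by length,
    reduced to well-covering sources, and optionally limited to `n_events` events.
    """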
print("Loading and cleaning text...")
clean_csv_file = csv_file.replace('.csv', '-ev%s-minlen%d-thresh%.3f.clean.csv' % (
n_events if n_events < np.inf else 'All', min_article_length, source_thresh))
if not force and op.exists(clean_csv_file):
df = pd.read_csv(clean_csv_file)
event_uris = df['eventUri'].unique()
return df, event_uris
# Need to download data.
fetch_event_articles(
api_key=eventregistry_api_key, min_articles=min_articles,
csv_file=csv_file)
df = pd.read_csv(csv_file)
df['body'] = df['body'].apply(clean_text)
event_uris = df['eventUri'].unique()
# Reduce by article length
good_idx = df['body'].apply(
lambda b: not pd.isnull(b) and len(b) > min_article_length)
df = df[good_idx]
# Reduce by source.
df = reduce_by_source(
df, thresh=source_thresh,
max_rows_per_article=np.inf if group_by == 'source' else 1)
# Reduce by # events
if n_events and n_events < np.inf:
keep_event_uris = event_uris[:n_events]
new_df = df[df['eventUri'].apply(lambda uri: uri in keep_event_uris)]
assert n_events == len(new_df['eventUri'].unique())
print("\tReduced from %d to %d events, %d to %d articles." % (
len(event_uris), len(keep_event_uris), len(df), len(new_df)))
df = new_df
event_uris = keep_event_uris
df.to_csv(clean_csv_file, encoding='utf-8')
return df, event_uris
def vectorize_articles(df, event_uris, pkl_file, min_vocab_length=100,
lda_min_appearances=2, lda_vectorization_type='count', force=False):
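    """Lemmatize and vectorize article bodies (or load cached vectors), then drop
    articles with fewer than `min_vocab_length` distinct vocabulary words.
    """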
print("Vectorizing data...")
if not force and op.exists(pkl_file):
with open(pkl_file, 'rb') as fp:
article_counts, vocab, vectorizer = pkl.load(fp)
else:
docs = df['body'].values
l_docs = do_lemmatize(docs)
article_counts, vocab, vectorizer = do_vectorize(
l_docs, min_df=lda_min_appearances, type=lda_vectorization_type)
with open(pkl_file, 'wb') as fp:
pkl.dump((article_counts, vocab, vectorizer), fp)
# Limit by vocab
good_article_idx = np.squeeze(np.asarray((article_counts > 0).sum(axis=1) >= min_vocab_length))
article_counts = article_counts[good_article_idx]
good_vocab_idx = np.squeeze(np.asarray(article_counts.sum(axis=0) > 0))
article_counts = article_counts[:, good_vocab_idx]
vocab = vocab[good_vocab_idx]
df = df.iloc[good_article_idx]
return article_counts, vocab, vectorizer, df
def group_article_counts_by_source(df, article_counts):
    """Sum per-article word counts into per-source word counts.

    Inputs:
        df - DataFrame with one row per article; 'source_title' identifies the news source.
        article_counts - matrix with one row per article and one column per vocabulary
            item; each value is the number of times that word appears in the article.

    Returns:
        source_counts - matrix with one row per news source and one column per
            vocabulary item, summed over all articles from that source.
    """
    sources = df['source_title'].unique()
    print("Reorganizing %d articles by %d sources..." % (article_counts.shape[0], len(sources)))
    source_counts = []
    for source in sources:
        source_articles_idx = (df['source_title'] == source).values
        source_counts.append(article_counts[source_articles_idx].sum(axis=0))
    return np.squeeze(np.asarray(source_counts))
def model_articles(df, article_counts, vectorizer, vocab, event_uris, n_events=2,
truth_frequency_thresh=0.5, force=False, group_by='source',
n_topics=10, n_iter=1500, lda_vectorization_type='count'):
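    """Fit an LDA model on word counts grouped by article or by news source (or load a cached model).

    Before fitting, words mentioned by at least `truth_frequency_thresh` of an event's
    articles are treated as shared "truth" vocabulary and zeroed out, so the topics
    emphasize how coverage differs rather than the common story.
    """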
print("Training model ...")
# Now model
model_pkl = 'lda-model-groupBy%s-thresh%.2f-top%d-iter%d-ev%d.pkl' % (
group_by, truth_frequency_thresh, n_topics, n_iter, n_events)
if not force and op.exists(model_pkl):
# This error catch all isn't working correctly.
# the vocabulary from the articles are not assigned.
with open(model_pkl, 'rb') as fp:
lda_labels, lda_output_mat, lda_cats, lda_mat, model = pkl.load(fp)
else:
article_events = df['eventUri'].values
for uri in event_uris:
print uri
event_article_counts = article_counts[article_events == uri]
n_articles = event_article_counts.shape[0]
word_freq_over_articles = (event_article_counts > 0).sum(axis=0) / float(n_articles)
word_freq_over_articles = np.squeeze(np.asarray(word_freq_over_articles))
# Store common words for this event, then blank them out in the word counts
common_vocab_idx = word_freq_over_articles >= truth_frequency_thresh
article_count_idx = np.asmatrix(article_events == uri).T * np.asmatrix(common_vocab_idx)
article_counts[article_count_idx] = 0
# print '\tEvent vocab:', vocab[common_vocab_idx]
# print common_vocab_idx.sum(), article_counts.shape
if lda_vectorization_type == 'tfidf':
# Make sure lda_mat has valid values.
article_counts = (10 * article_counts).astype(int)
good_idx = article_counts.sum(axis=0) > 0
good_idx = np.reshape(np.asarray(good_idx), (good_idx.size,))
article_counts = article_counts[:, good_idx]
vocab = vocab[good_idx]
if group_by == 'source':
word_counts = group_article_counts_by_source(df=df, article_counts=article_counts)
else:
word_counts = article_counts
lda_labels, lda_output_mat, lda_cats, lda_mat, model = do_lda(
lda_mat=word_counts, vectorizer=vectorizer, vocab=vocab,
n_topics=n_topics, n_top_words=10, n_iter=n_iter, return_model=True)
with open(model_pkl, 'wb') as fp:
pkl.dump((lda_labels, lda_output_mat, lda_cats, lda_mat, model), fp)
return article_counts, lda_labels, lda_output_mat, lda_cats, lda_mat, model
def main(csv_file='raw_dataframe.csv', n_events=2, min_article_length=250,
force=False, min_vocab_length=100, min_articles=500, source_thresh=0.75,
lda_min_appearances=2, lda_vectorization_type='count',
lda_groupby='source', lda_iters=1500, lda_topics=10,
truth_frequency_thresh=0.5, points_per_category=500,
plotly_username=None, plotly_api_key=None, eventregistry_api_key=None):
"""
Do it all!
"""
# Note: to force re-download of event article info, you'll have to delete files manually.
# Too risky to pass a flag...
df, event_uris = load_and_clean(
csv_file=csv_file, min_articles=min_articles, eventregistry_api_key=eventregistry_api_key,
n_events=n_events, min_article_length=min_article_length, source_thresh=source_thresh,
group_by=lda_groupby)
n_events = n_events if n_events < np.inf else len(df['eventUri'].unique())
pkl_file = '%s-ev%s.pkl' % (csv_file.replace('.csv', ''), n_events)
article_counts, vocab, vectorizer, df = vectorize_articles(
df=df, event_uris=event_uris, pkl_file=pkl_file, force=force,
lda_min_appearances=lda_min_appearances, lda_vectorization_type=lda_vectorization_type,
min_vocab_length=min_vocab_length)
_, lda_labels, lda_output_mat, lda_cats, lda_mat, model = model_articles(
df=df, event_uris=event_uris, vectorizer=vectorizer, vocab=vocab,
article_counts=article_counts, force=force, n_events=n_events,
group_by=lda_groupby, n_topics=lda_topics, n_iter=lda_iters,
truth_frequency_thresh=truth_frequency_thresh,
lda_vectorization_type=lda_vectorization_type)
tsne_plotly(data=lda_output_mat, cat=lda_cats, labels=lda_labels,
source=df['source_title'], max_points_per_category=points_per_category,
username=plotly_username, api_key=plotly_api_key)
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Train and visualize an LDA model on news article bias.')

    # Training data
    parser.add_argument('--force', action='store_true')
    parser.add_argument('--csv-file', default=None)
    parser.add_argument('--n-events', type=int, default=np.inf)

    # Model parameters
    parser.add_argument('--min-articles', type=int, default=500,
                        help='Min # articles per event, to keep the event')
    parser.add_argument('--source-thresh', type=float, default=0.75,
                        help='Min %% of events a news source must cover, to be included.')
    parser.add_argument('--min-article-length', type=int, default=250,
                        help='Min # words in an article (pre-parsing)')
    parser.add_argument('--min-vocab-length', type=int, default=100,
                        help='Min # words in an article (post-lemmatizing, vectorizing)')
    parser.add_argument('--lda-min-appearances', type=int, default=2,
                        help='Min # appearances of a word, to be included in the vocabulary')
    parser.add_argument('--lda-vectorization-type', default='count', choices=('count', 'tfidf'),
                        help='Type of vectorization to use when converting articles to word counts.')
    parser.add_argument('--lda-groupby', default='article', choices=('source', 'article'),
                        help='Run LDA on text separated by article, or by news source?')
    parser.add_argument('--lda-topics', type=int, default=10,
                        help='# of LDA topics')
    parser.add_argument('--lda-iters', type=int, default=1500,
                        help='# of LDA iterations')
    parser.add_argument('--truth-frequency-thresh', type=float, default=0.5,
                        help='%% of articles in a news event that must mention a word, '
                             'for it to be "truth" / removed.')
    parser.add_argument('--points-per-category', type=int, default=500,
                        help='# points per category (scatter)')

    # API info
    parser.add_argument('--plotly-username', default='bakeralex664')
    parser.add_argument('--plotly-api-key', default='hWwBstLnNCX5CsDZpOSU')
    parser.add_argument('--eventregistry-api-key', default='8b86c30c-cb8f-4d3f-aa84-077f3090e5ba')

    args = vars(parser.parse_args())
    args['csv_file'] = args['csv_file'] or 'raw_dataframe-min%d.csv' % args['min_articles']
    main(**args)