
Commit 0ce0331
Added titles on the set of possible claims.
Panayiotis Smeros committed Feb 7, 2020
1 parent 3373662 commit 0ce0331
Showing 4 changed files with 27 additions and 8 deletions.
bin/configure.sh (19 additions, 0 deletions)
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+sudo apt install -y openjdk-8-jre screen htop git vim
+
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+chmod +x Miniconda3-latest-Linux-x86_64.sh
+./Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
+rm -rf Miniconda3-latest-Linux-x86_64.sh
+
+conda install -y pandas numpy networkx nltk spacy pyspark beautifulsoup4 scikit-learn
+conda install pytorch cudatoolkit=9.0 -c pytorch
+pip install -U newspaper3k textstat pandarallel simpletransformers
+python -m nltk.downloader punkt vader_lexicon #-d /path/to/nltk_data
+python -m spacy download en_core_web_lg
+
+git clone https://github.com/rwalk/gsdmm; cd gsdmm; python setup.py install; cd ..; rm -rf gsdmm

src/clustering.py (1 addition, 2 deletions)
@@ -232,5 +232,4 @@ def popularity_clustering(learn_transform, iterations=1, top_k=5):
 #disjoint_clustering(method='GMM', dimension=10)
 #disjoint_clustering(method='KMeans', dimension=10)
 #disjoint_clustering(method='KMeans')
-popularity_clustering(learn_transform=False, iterations=2)
-exit()
+popularity_clustering(learn_transform=False, iterations=2)
src/extracting.py (5 additions, 5 deletions)
@@ -1,14 +1,12 @@
+import re
 from pathlib import Path
 
 import matplotlib.pyplot as plt
 import networkx as nx
-import pandas as pd
 import numpy as np
+import pandas as pd
 import spacy
 from simpletransformers.classification import ClassificationModel
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import precision_recall_fscore_support
-import re
 
 ############################### CONSTANTS ###############################
 scilens_dir = str(Path.home()) + '/data/scilens/cache/diffusion_graph/scilens_3M/'
@@ -65,7 +63,7 @@ def prepare_eval_dataset(gold_agreement):
     df = df.rename(columns={'Input.sentence':'sentence', 'Input.golden_label':'golden_label', 'Input.type':'type', 'Answer.claim.label':'label', 'LifetimeApprovalRate':'approval'})
 
     df = df.dropna()
-    df = df[df.approval.apply(lambda x: int(re.sub('\%.*', '', x))) != 0]
+    df = df[df.approval.apply(lambda x: int(re.sub(r'\%.*', '', x))) != 0]
 
     #aggregate results from crowdworkers
     df = pd.DataFrame(df.groupby(['sentence', 'type', 'golden_label'])['label'].apply(lambda x: (lambda c: (c.index[0], 'strong') if c.get(0) - c.get(1, default=0) > 1 else (c.index[0], 'weak') if c.get(0) - c.get(1, default=0) == 1 else np.nan)(x.value_counts())).apply(pd.Series))
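The only substantive change in this hunk is the raw-string prefix on the regex. Since Python 3.6, '\%' in an ordinary string literal is an invalid escape sequence and emits a DeprecationWarning, even though re still treats \% as a literal percent sign; r'\%.*' silences the warning without changing behavior. A minimal sketch of what the filter does, assuming an MTurk-style approval string such as '98% (123/125)' (hypothetical values, not from the repo):

import re

# Hypothetical approval-rate strings; the filter drops rows whose
# leading percentage is zero.
approvals = ['98% (123/125)', '0% (0/4)']
kept = [a for a in approvals if int(re.sub(r'\%.*', '', a)) != 0]
print(kept)  # ['98% (123/125)']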
@@ -102,9 +100,11 @@ def pred_BERT(model):
     model = ClassificationModel('bert', model, use_cuda=False)
 
     articles = pd.read_csv(scilens_dir + 'article_details_v3.tsv.bz2', sep='\t')
+    titles = articles[['url', 'title']].drop_duplicates(subset='url').rename(columns={'title': 'claim'})
     articles = articles[['url', 'quotes']].drop_duplicates(subset='url')
     articles.quotes = articles.quotes.apply(lambda l: list(map(lambda d: d['quote'], eval(l))))
     articles = articles.explode('quotes').rename(columns={'quotes': 'claim'})
+    articles = pd.concat([articles, titles])
     articles = articles[~articles['claim'].isna()]
 
     articles['label'], _ = model.predict(articles.claim)
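This hunk is the change named in the commit title: article titles now join the extracted quotes as candidate claims before BERT labeling, so an article with no quotes still contributes one claim. A toy sketch of the reshaping, using hypothetical data in the shape of article_details_v3.tsv.bz2:

import pandas as pd

# Hypothetical articles; quotes are serialized as a stringified list of dicts.
articles = pd.DataFrame({
    'url': ['http://a.com', 'http://b.com'],
    'title': ['Coffee cures cancer', 'New study on sleep'],
    'quotes': ["[{'quote': 'it really works'}]", '[]'],
})

titles = articles[['url', 'title']].drop_duplicates(subset='url').rename(columns={'title': 'claim'})
quotes = articles[['url', 'quotes']].drop_duplicates(subset='url')
quotes['quotes'] = quotes['quotes'].apply(lambda l: [d['quote'] for d in eval(l)])
quotes = quotes.explode('quotes').rename(columns={'quotes': 'claim'})

claims = pd.concat([quotes, titles])
claims = claims[~claims['claim'].isna()]
print(claims)  # one row per quote plus one row per title, keyed by url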
src/matrix_preparation.py (2 additions, 1 deletion)
@@ -69,6 +69,7 @@ def matrix_preparation(representations, pca_dimensions=None):
     tweets = pd.read_csv(scilens_dir + 'tweet_details_v1.tsv.bz2', sep='\t').drop_duplicates(subset='url').set_index('url')
     claims['popularity'] = claims.url.parallel_apply(lambda u: sum([tweets.loc[t]['popularity'] for t in G.predecessors(u) if t in tweets.index]))
 
+    claims.claim = claims.claim.apply(eval)
     claims = claims.explode('claim')
 
     claims['clean_claim'] = claims['claim'].parallel_apply(clean_claim)
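The added apply(eval) suggests the claim column arrives as stringified Python lists (a side effect of round-tripping lists through a TSV), which must be parsed back before explode can split them into one row per claim. A small sketch with made-up data; ast.literal_eval is the safer equivalent when the column holds plain literals:

import ast
import pandas as pd

# Hypothetical claims frame, as it would look straight after pd.read_csv:
claims = pd.DataFrame({'url': ['http://a.com'],
                       'claim': ["['claim one', 'claim two']"]})

claims['claim'] = claims['claim'].apply(ast.literal_eval)  # same effect as eval here
claims = claims.explode('claim')
print(claims)  # two rows for http://a.com, one per claim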
@@ -107,4 +108,4 @@ def matrix_preparation(representations, pca_dimensions=None):


 if __name__ == "__main__":
-    matrix_preparation(representations=['textual','embeddings'], pca_dimensions=[2])
+    matrix_preparation(representations=['textual','embeddings'], pca_dimensions=[10])
