From 231d984d126aec204ceb5e62118be724c7f7900b Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Nov 2019 17:00:13 +0000
Subject: [PATCH 1/4] codemod(git) untrack user-sensitive files

---
 .gitignore                                    | 133 ++++++++++++++++++
 code/{config.json => config_example.json}     |   0
 ...file.json => tweet_keys_file_example.json} |   0
 3 files changed, 133 insertions(+)
 create mode 100644 .gitignore
 rename code/{config.json => config_example.json} (100%)
 rename code/resources/{tweet_keys_file.json => tweet_keys_file_example.json} (100%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..ddeac2ac6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,133 @@
+*.out
+fakenewsnet_dataset
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/code/config.json b/code/config_example.json
similarity index 100%
rename from code/config.json
rename to code/config_example.json
diff --git a/code/resources/tweet_keys_file.json b/code/resources/tweet_keys_file_example.json
similarity index 100%
rename from code/resources/tweet_keys_file.json
rename to code/resources/tweet_keys_file_example.json
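
With config.json and tweet_keys_file.json untracked, a fresh clone has to recreate them from the *_example.json copies before the collectors will run. A minimal bootstrap sketch, assuming only that each example file should be copied back under its original name (this helper is illustrative and is not part of the patch series):

    # bootstrap_config.py - hypothetical helper, not shipped with this series.
    # Copies the tracked example files to the untracked paths the code reads,
    # so real credentials never have to be committed.
    import shutil
    from pathlib import Path

    EXAMPLES = {
        "code/config_example.json": "code/config.json",
        "code/resources/tweet_keys_file_example.json": "code/resources/tweet_keys_file.json",
    }

    for example, target in EXAMPLES.items():
        if not Path(target).exists():
            shutil.copyfile(example, target)
            print("created {} - now fill in your own credentials".format(target))
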
From 412a525f14850988b4b6aae45bd7343e3b66ee71 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Nov 2019 18:09:06 +0000
Subject: [PATCH 2/4] feat(speed) only get retweets for tweets that have more
 than 0 retweet_count

---
 code/retweet_collection.py | 18 ++++++++++++++----
 code/util/util.py          |  2 ++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/code/retweet_collection.py b/code/retweet_collection.py
index 43596be1c..616b729a5 100644
--- a/code/retweet_collection.py
+++ b/code/retweet_collection.py
@@ -5,18 +5,29 @@ from tweet_collection import Tweet
 from util.TwythonConnector import TwythonConnector
-from util.util import create_dir, Config, multiprocess_data_collection
+from util.util import create_dir, Config, multiprocess_data_collection, is_file_exists
 
 from util.util import DataCollector
 from util import Constants
 
 
+def _should_fetch_retweets(tweet: Tweet, dump_dir):
+    tweet_filename = "{}/{}.json".format(dump_dir, tweet.tweet_id)
+    if not is_file_exists(tweet_filename):
+        return True
+    with open(tweet_filename) as file:
+        tweet_object = json.load(file)
+        return tweet_object.get("retweet_count", 0) > 0
+
 
 def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
     retweets = []
     connection = None
+
+    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
     try:
-        connection = twython_connector.get_twython_connection("get_retweet")
-        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
+        connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
+        if _should_fetch_retweets(tweet, dump_dir):
+            retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
 
     except TwythonRateLimitError:
         logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
@@ -27,7 +38,6 @@ def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonCo
 
     retweet_obj = {"retweets": retweets}
 
-    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
     retweet_dir = "{}/retweets".format(dump_dir)
     create_dir(dump_dir)
     create_dir(retweet_dir)
diff --git a/code/util/util.py b/code/util/util.py
index 451ff2c76..3654bb2a4 100644
--- a/code/util/util.py
+++ b/code/util/util.py
@@ -81,6 +81,8 @@ def create_dir(dir_name):
 def is_folder_exists(folder_name):
     return os.path.exists(folder_name)
 
+def is_file_exists(file_name):
+    return os.path.exists(file_name)
 
 def equal_chunks(list, chunk_size):
     """return successive n-sized chunks from l."""
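
The gate this patch adds is easy to state on its own. A standalone sketch (the path and the JSON content below are made up for illustration):

    import json
    import os

    def should_fetch_retweets(tweet_json_path):
        # No cached tweet object yet: we cannot rule retweets out, so fetch.
        if not os.path.exists(tweet_json_path):
            return True
        with open(tweet_json_path) as f:
            tweet_object = json.load(f)
        # Twitter's tweet objects carry retweet_count; 0 means get_retweets
        # would return nothing, so the API call can be skipped entirely.
        return tweet_object.get("retweet_count", 0) > 0

For a dataset where most tweets are never retweeted, this saves one get_retweets call, and one slot of the rate limit, per retweet-less tweet.
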
From 96eed0be3e7b8cb6fda1e28aa1241d7886637ea6 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Nov 2019 18:51:02 +0000
Subject: [PATCH 3/4] fix(retweet) remove creation of connection if retweet
 fetch is not needed

---
 code/retweet_collection.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/code/retweet_collection.py b/code/retweet_collection.py
index 616b729a5..7b7e236b7 100644
--- a/code/retweet_collection.py
+++ b/code/retweet_collection.py
@@ -24,17 +24,18 @@ def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonCo
     connection = None
 
     dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
-    try:
-        connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
-        if _should_fetch_retweets(tweet, dump_dir):
+
+    if _should_fetch_retweets(tweet, dump_dir):
+        try:
+            connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
             retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
 
-    except TwythonRateLimitError:
-        logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
+        except TwythonRateLimitError:
+            logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
 
-    except Exception:
-        logging.exception(
-            "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))
+        except Exception:
+            logging.exception(
+                "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))
 
     retweet_obj = {"retweets": retweets}
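
The motivation, restated: patch 2 still asked the TwythonConnector for a connection before deciding whether to fetch, so every skipped tweet consumed a connection for nothing. A toy measurement of the difference (all names below are invented; it assumes, as the commit message implies, that get_twython_connection is the costly step, e.g. because it rotates through rate-limited keys):

    acquired = 0

    def get_connection():
        # Stand-in for twython_connector.get_twython_connection(...).
        global acquired
        acquired += 1
        return object()

    # Suppose 90% of tweets in a dump have no retweets to fetch.
    needs_fetch = [False] * 90 + [True] * 10

    # Patch 2 ordering acquired a connection for all 100 jobs; with this
    # patch the acquisition sits inside the guard, so only 10 are paid for.
    for needed in needs_fetch:
        if needed:
            connection = get_connection()

    assert acquired == 10
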
= "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id) + dump_dir = get_dump_dir(config, tweet) if _should_fetch_retweets(tweet, dump_dir): try: @@ -31,11 +47,17 @@ def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonCo retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1) except TwythonRateLimitError: - logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id)) + logging.exception( + "Twython API rate limit exception - tweet id : {}".format( + tweet.tweet_id + ) + ) except Exception: logging.exception( - "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection)) + "Exception in getting retweets for tweet id %d using connection %s" + % (tweet.tweet_id, connection) + ) retweet_obj = {"retweets": retweets} @@ -58,15 +80,27 @@ def collect_retweets(news_list, news_source, label, config: Config): for tweet_id in news.tweet_ids: tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label)) - multiprocess_data_collection(dump_retweets_job, tweet_id_list, (config, config.twython_connector), config) + filtered_tweet_id_list = [ + tweet + for tweet in tweet_id_list + if not _should_skip_retweets(tweet, get_dump_dir(config, tweet),) + ] + multiprocess_data_collection( + dump_retweets_job, + filtered_tweet_id_list, + (config, config.twython_connector), + config, + ) -class RetweetCollector(DataCollector): +class RetweetCollector(DataCollector): def __init__(self, config): super(RetweetCollector, self).__init__(config) def collect_data(self, choices): for choice in choices: news_list = self.load_news_file(choice) - collect_retweets(news_list, choice["news_source"], choice["label"], self.config) + collect_retweets( + news_list, choice["news_source"], choice["label"], self.config + ) diff --git a/code/tweet_collection.py b/code/tweet_collection.py index 7c4c5f0e3..0506e2957 100644 --- a/code/tweet_collection.py +++ b/code/tweet_collection.py @@ -13,15 +13,6 @@ from util.util import equal_chunks -class Tweet: - - def __init__(self, tweet_id, news_id, news_source, label): - self.tweet_id = tweet_id - self.news_id = news_id - self.news_source = news_source - self.label = label - - def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector): """Collect info and dump info of tweet chunk containing atmost 100 tweets""" diff --git a/code/util/util.py b/code/util/util.py index 3654bb2a4..91d49f39d 100644 --- a/code/util/util.py +++ b/code/util/util.py @@ -9,16 +9,24 @@ from util.TwythonConnector import TwythonConnector -class News: +class Tweet: + + def __init__(self, tweet_id, news_id, news_source, label): + self.tweet_id = tweet_id + self.news_id = news_id + self.news_source = news_source + self.label = label + +class News: def __init__(self, info_dict, label, news_platform): self.news_id = info_dict["id"] self.news_url = info_dict["news_url"] self.news_title = info_dict["title"] - self.tweet_ids =[] + self.tweet_ids = [] try: - tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")] + tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")] self.tweet_ids = tweets except: pass @@ -27,9 +35,7 @@ def __init__(self, info_dict, label, news_platform): self.platform = news_platform - class Config: - def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process): self.dataset_dir = data_dir self.dump_location = data_collection_dir @@ -39,9 +45,7 @@ def 
diff --git a/code/util/util.py b/code/util/util.py
index 3654bb2a4..91d49f39d 100644
--- a/code/util/util.py
+++ b/code/util/util.py
@@ -9,16 +9,24 @@ from util.TwythonConnector import TwythonConnector
 
-class News:
+class Tweet:
+
+    def __init__(self, tweet_id, news_id, news_source, label):
+        self.tweet_id = tweet_id
+        self.news_id = news_id
+        self.news_source = news_source
+        self.label = label
+
 
+class News:
     def __init__(self, info_dict, label, news_platform):
         self.news_id = info_dict["id"]
         self.news_url = info_dict["news_url"]
         self.news_title = info_dict["title"]
 
-        self.tweet_ids =[]
+        self.tweet_ids = []
         try:
-            tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
+            tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
             self.tweet_ids = tweets
         except:
             pass
@@ -27,9 +35,7 @@ def __init__(self, info_dict, label, news_platform):
 
         self.platform = news_platform
 
-
 class Config:
-
     def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
         self.dataset_dir = data_dir
         self.dump_location = data_collection_dir
@@ -39,9 +45,7 @@ def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
 
         self.twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
 
-
 class DataCollector:
-
     def __init__(self, config):
         self.config = config
 
@@ -60,11 +64,19 @@ def load_news_file(self, data_choice):
             maxInt = int(maxInt / 10)
 
         news_list = []
-        with open('{}/{}_{}.csv'.format(self.config.dataset_dir, data_choice["news_source"],
-                                        data_choice["label"]), encoding="UTF-8") as csvfile:
+        with open(
+            "{}/{}_{}.csv".format(
+                self.config.dataset_dir,
+                data_choice["news_source"],
+                data_choice["label"],
+            ),
+            encoding="UTF-8",
+        ) as csvfile:
             reader = csv.DictReader(csvfile)
             for news in reader:
-                news_list.append(News(news, data_choice["label"], data_choice["news_source"]))
+                news_list.append(
+                    News(news, data_choice["label"], data_choice["news_source"])
+                )
 
         return news_list
 
@@ -78,17 +90,25 @@ def create_dir(dir_name):
         raise
 
 
+def get_dump_dir(config: Config, tweet: Tweet) -> str:
+    return "{}/{}/{}/{}".format(
+        config.dump_location, tweet.news_source, tweet.label, tweet.news_id
+    )
+
+
 def is_folder_exists(folder_name):
     return os.path.exists(folder_name)
 
+
 def is_file_exists(file_name):
     return os.path.exists(file_name)
 
+
 def equal_chunks(list, chunk_size):
     """return successive n-sized chunks from l."""
     chunks = []
     for i in range(0, len(list), chunk_size):
-        chunks.append(list[i:i + chunk_size])
+        chunks.append(list[i : i + chunk_size])
     return chunks
 
@@ -103,7 +123,9 @@ def update(arg):
         pbar.update()
 
     for i in range(pbar.total):
-        pool.apply_async(function_reference, args=(data_list[i],)+ args, callback=update)
+        pool.apply_async(
+            function_reference, args=(data_list[i],) + args, callback=update
+        )
 
     pool.close()
     pool.join()
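
For orientation, a quick usage sketch of the pieces util.py now owns (the values below are placeholders, and a SimpleNamespace stands in for Config because constructing a real Config also wires up a TwythonConnector against the key server):

    from types import SimpleNamespace

    from util.util import Tweet, get_dump_dir

    config = SimpleNamespace(dump_location="data_collection")
    tweet = Tweet("1234567890", "politifact14940", "politifact", "fake")

    # One place now decides the on-disk layout; retweet_collection.py calls
    # this instead of rebuilding the format string inline. (The Config/Tweet
    # type hints on get_dump_dir are not enforced, so the stand-in works.)
    print(get_dump_dir(config, tweet))
    # -> data_collection/politifact/fake/politifact14940

Moving Tweet out of tweet_collection.py also removes the tweet_collection import from retweet_collection.py, leaving util.util as the one shared home for the data classes.
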