From 231d984d126aec204ceb5e62118be724c7f7900b Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Nov 2019 17:00:13 +0000
Subject: [PATCH 1/4] codemod(git) untrack user-sensitive files

---
 .gitignore                                    | 133 ++++++++++++++++++
 code/{config.json => config_example.json}     |   0
 ...file.json => tweet_keys_file_example.json} |   0
 3 files changed, 133 insertions(+)
 create mode 100644 .gitignore
 rename code/{config.json => config_example.json} (100%)
 rename code/resources/{tweet_keys_file.json => tweet_keys_file_example.json} (100%)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 000000000..ddeac2ac6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,133 @@
+*.out
+fakenewsnet_dataset
+
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
diff --git a/code/config.json b/code/config_example.json
similarity index 100%
rename from code/config.json
rename to code/config_example.json
diff --git a/code/resources/tweet_keys_file.json b/code/resources/tweet_keys_file_example.json
similarity index 100%
rename from code/resources/tweet_keys_file.json
rename to code/resources/tweet_keys_file_example.json
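
With config.json and tweet_keys_file.json untracked, a fresh clone has to recreate them from the *_example.json copies before the collectors will run. A minimal bootstrap sketch, assuming only that each example file should be copied back under its original name (this helper is illustrative and is not part of the patch series):

    # bootstrap_config.py - hypothetical helper, not shipped with this series.
    # Copies the tracked example files to the untracked paths the code reads,
    # so real credentials never have to be committed.
    import shutil
    from pathlib import Path

    EXAMPLES = {
        "code/config_example.json": "code/config.json",
        "code/resources/tweet_keys_file_example.json": "code/resources/tweet_keys_file.json",
    }

    for example, target in EXAMPLES.items():
        if not Path(target).exists():
            shutil.copyfile(example, target)
            print("created {} - now fill in your own credentials".format(target))
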
From 412a525f14850988b4b6aae45bd7343e3b66ee71 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Nov 2019 18:09:06 +0000
Subject: [PATCH 2/4] feat(speed) only get retweets for tweets that have more
 than 0 retweet_count

---
 code/retweet_collection.py | 18 ++++++++++++++----
 code/util/util.py          |  2 ++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/code/retweet_collection.py b/code/retweet_collection.py
index 43596be1c..616b729a5 100644
--- a/code/retweet_collection.py
+++ b/code/retweet_collection.py
@@ -5,18 +5,29 @@ from tweet_collection import Tweet
 from util.TwythonConnector import TwythonConnector
-from util.util import create_dir, Config, multiprocess_data_collection
+from util.util import create_dir, Config, multiprocess_data_collection, is_file_exists
 
 from util.util import DataCollector
 from util import Constants
 
 
+def _should_fetch_retweets(tweet: Tweet, dump_dir):
+    tweet_filename = "{}/{}.json".format(dump_dir, tweet.tweet_id)
+    if not is_file_exists(tweet_filename):
+        return True
+    with open(tweet_filename) as file:
+        tweet_object = json.load(file)
+        return tweet_object.get("retweet_count", 0) > 0
+
 
 def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
     retweets = []
     connection = None
+
+    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
     try:
-        connection = twython_connector.get_twython_connection("get_retweet")
-        retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
+        connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
+        if _should_fetch_retweets(tweet, dump_dir):
+            retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
 
     except TwythonRateLimitError:
         logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
@@ -27,7 +38,6 @@ def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonCo
 
     retweet_obj = {"retweets": retweets}
 
-    dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
     retweet_dir = "{}/retweets".format(dump_dir)
     create_dir(dump_dir)
     create_dir(retweet_dir)
diff --git a/code/util/util.py b/code/util/util.py
index 451ff2c76..3654bb2a4 100644
--- a/code/util/util.py
+++ b/code/util/util.py
@@ -81,6 +81,8 @@ def create_dir(dir_name):
 def is_folder_exists(folder_name):
     return os.path.exists(folder_name)
 
+def is_file_exists(file_name):
+    return os.path.exists(file_name)
 
 def equal_chunks(list, chunk_size):
     """return successive n-sized chunks from l."""
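
The gate this patch adds is easy to state on its own. A standalone sketch (the path and the JSON content below are made up for illustration):

    import json
    import os

    def should_fetch_retweets(tweet_json_path):
        # No cached tweet object yet: we cannot rule retweets out, so fetch.
        if not os.path.exists(tweet_json_path):
            return True
        with open(tweet_json_path) as f:
            tweet_object = json.load(f)
        # Twitter's tweet objects carry retweet_count; 0 means get_retweets
        # would return nothing, so the API call can be skipped entirely.
        return tweet_object.get("retweet_count", 0) > 0

For a dataset where most tweets are never retweeted, this saves one get_retweets call, and one slot of the rate limit, per retweet-less tweet.
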
From 96eed0be3e7b8cb6fda1e28aa1241d7886637ea6 Mon Sep 17 00:00:00 2001
From: Ubuntu
Date: Wed, 20 Nov 2019 18:51:02 +0000
Subject: [PATCH 3/4] fix(retweet) remove creation of connection if retweet
 fetch is not needed

---
 code/retweet_collection.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/code/retweet_collection.py b/code/retweet_collection.py
index 616b729a5..7b7e236b7 100644
--- a/code/retweet_collection.py
+++ b/code/retweet_collection.py
@@ -24,17 +24,18 @@ def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonCo
     connection = None
 
     dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
-    try:
-        connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
-        if _should_fetch_retweets(tweet, dump_dir):
+
+    if _should_fetch_retweets(tweet, dump_dir):
+        try:
+            connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
             retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)
 
-    except TwythonRateLimitError:
-        logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
+        except TwythonRateLimitError:
+            logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
 
-    except Exception:
-        logging.exception(
-            "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))
+        except Exception:
+            logging.exception(
+                "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))
 
     retweet_obj = {"retweets": retweets}
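
The motivation, restated: patch 2 still asked the TwythonConnector for a connection before deciding whether to fetch, so every skipped tweet consumed a connection for nothing. A toy measurement of the difference (all names below are invented; it assumes, as the commit message implies, that get_twython_connection is the costly step, e.g. because it rotates through rate-limited keys):

    acquired = 0

    def get_connection():
        # Stand-in for twython_connector.get_twython_connection(...).
        global acquired
        acquired += 1
        return object()

    # Suppose 90% of tweets in a dump have no retweets to fetch.
    needs_fetch = [False] * 90 + [True] * 10

    # Patch 2 ordering acquired a connection for all 100 jobs; with this
    # patch the acquisition sits inside the guard, so only 10 are paid for.
    for needed in needs_fetch:
        if needed:
            connection = get_connection()

    assert acquired == 10
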
= "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id) + dump_dir = get_dump_dir(config, tweet) if _should_fetch_retweets(tweet, dump_dir): try: @@ -31,11 +47,17 @@ def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonCo retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1) except TwythonRateLimitError: - logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id)) + logging.exception( + "Twython API rate limit exception - tweet id : {}".format( + tweet.tweet_id + ) + ) except Exception: logging.exception( - "Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection)) + "Exception in getting retweets for tweet id %d using connection %s" + % (tweet.tweet_id, connection) + ) retweet_obj = {"retweets": retweets} @@ -58,15 +80,27 @@ def collect_retweets(news_list, news_source, label, config: Config): for tweet_id in news.tweet_ids: tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label)) - multiprocess_data_collection(dump_retweets_job, tweet_id_list, (config, config.twython_connector), config) + filtered_tweet_id_list = [ + tweet + for tweet in tweet_id_list + if not _should_skip_retweets(tweet, get_dump_dir(config, tweet),) + ] + multiprocess_data_collection( + dump_retweets_job, + filtered_tweet_id_list, + (config, config.twython_connector), + config, + ) -class RetweetCollector(DataCollector): +class RetweetCollector(DataCollector): def __init__(self, config): super(RetweetCollector, self).__init__(config) def collect_data(self, choices): for choice in choices: news_list = self.load_news_file(choice) - collect_retweets(news_list, choice["news_source"], choice["label"], self.config) + collect_retweets( + news_list, choice["news_source"], choice["label"], self.config + ) diff --git a/code/tweet_collection.py b/code/tweet_collection.py index 7c4c5f0e3..0506e2957 100644 --- a/code/tweet_collection.py +++ b/code/tweet_collection.py @@ -13,15 +13,6 @@ from util.util import equal_chunks -class Tweet: - - def __init__(self, tweet_id, news_id, news_source, label): - self.tweet_id = tweet_id - self.news_id = news_id - self.news_source = news_source - self.label = label - - def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector): """Collect info and dump info of tweet chunk containing atmost 100 tweets""" diff --git a/code/util/util.py b/code/util/util.py index 3654bb2a4..91d49f39d 100644 --- a/code/util/util.py +++ b/code/util/util.py @@ -9,16 +9,24 @@ from util.TwythonConnector import TwythonConnector -class News: +class Tweet: + + def __init__(self, tweet_id, news_id, news_source, label): + self.tweet_id = tweet_id + self.news_id = news_id + self.news_source = news_source + self.label = label + +class News: def __init__(self, info_dict, label, news_platform): self.news_id = info_dict["id"] self.news_url = info_dict["news_url"] self.news_title = info_dict["title"] - self.tweet_ids =[] + self.tweet_ids = [] try: - tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")] + tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")] self.tweet_ids = tweets except: pass @@ -27,9 +35,7 @@ def __init__(self, info_dict, label, news_platform): self.platform = news_platform - class Config: - def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process): self.dataset_dir = data_dir self.dump_location = data_collection_dir @@ -39,9 +45,7 @@ def 
diff --git a/code/util/util.py b/code/util/util.py
index 3654bb2a4..91d49f39d 100644
--- a/code/util/util.py
+++ b/code/util/util.py
@@ -9,16 +9,24 @@ from util.TwythonConnector import TwythonConnector
 
-class News:
+class Tweet:
+
+    def __init__(self, tweet_id, news_id, news_source, label):
+        self.tweet_id = tweet_id
+        self.news_id = news_id
+        self.news_source = news_source
+        self.label = label
+
 
+class News:
     def __init__(self, info_dict, label, news_platform):
         self.news_id = info_dict["id"]
         self.news_url = info_dict["news_url"]
         self.news_title = info_dict["title"]
 
-        self.tweet_ids =[]
+        self.tweet_ids = []
         try:
-            tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
+            tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
             self.tweet_ids = tweets
         except:
             pass
@@ -27,9 +35,7 @@ def __init__(self, info_dict, label, news_platform):
 
         self.platform = news_platform
 
-
 class Config:
-
     def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
         self.dataset_dir = data_dir
         self.dump_location = data_collection_dir
@@ -39,9 +45,7 @@ def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
 
         self.twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)
 
-
 class DataCollector:
-
     def __init__(self, config):
         self.config = config
 
@@ -60,11 +64,19 @@ def load_news_file(self, data_choice):
             maxInt = int(maxInt / 10)
 
         news_list = []
-        with open('{}/{}_{}.csv'.format(self.config.dataset_dir, data_choice["news_source"],
-                                        data_choice["label"]), encoding="UTF-8") as csvfile:
+        with open(
+            "{}/{}_{}.csv".format(
+                self.config.dataset_dir,
+                data_choice["news_source"],
+                data_choice["label"],
+            ),
+            encoding="UTF-8",
+        ) as csvfile:
             reader = csv.DictReader(csvfile)
             for news in reader:
-                news_list.append(News(news, data_choice["label"], data_choice["news_source"]))
+                news_list.append(
+                    News(news, data_choice["label"], data_choice["news_source"])
+                )
 
         return news_list
 
@@ -78,17 +90,25 @@ def create_dir(dir_name):
         raise
 
 
+def get_dump_dir(config: Config, tweet: Tweet) -> str:
+    return "{}/{}/{}/{}".format(
+        config.dump_location, tweet.news_source, tweet.label, tweet.news_id
+    )
+
+
 def is_folder_exists(folder_name):
     return os.path.exists(folder_name)
 
+
 def is_file_exists(file_name):
     return os.path.exists(file_name)
 
+
 def equal_chunks(list, chunk_size):
     """return successive n-sized chunks from l."""
     chunks = []
     for i in range(0, len(list), chunk_size):
-        chunks.append(list[i:i + chunk_size])
+        chunks.append(list[i : i + chunk_size])
     return chunks
 
@@ -103,7 +123,9 @@ def update(arg):
         pbar.update()
 
     for i in range(pbar.total):
-        pool.apply_async(function_reference, args=(data_list[i],)+ args, callback=update)
+        pool.apply_async(
+            function_reference, args=(data_list[i],) + args, callback=update
+        )
 
     pool.close()
     pool.join()
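
For orientation, a quick usage sketch of the pieces util.py now owns (the values below are placeholders, and a SimpleNamespace stands in for Config because constructing a real Config also wires up a TwythonConnector against the key server):

    from types import SimpleNamespace

    from util.util import Tweet, get_dump_dir

    config = SimpleNamespace(dump_location="data_collection")
    tweet = Tweet("1234567890", "politifact14940", "politifact", "fake")

    # One place now decides the on-disk layout; retweet_collection.py calls
    # this instead of rebuilding the format string inline. (The Config/Tweet
    # type hints on get_dump_dir are not enforced, so the stand-in works.)
    print(get_dump_dir(config, tweet))
    # -> data_collection/politifact/fake/politifact14940

Moving Tweet out of tweet_collection.py also removes the tweet_collection import from retweet_collection.py, leaving util.util as the one shared home for the data classes.
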