Skip retweets for tweets with no retweets #38

Open · wants to merge 4 commits into master
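The change pre-filters the tweet list before the collection workers are spawned: a tweet is skipped when its retweets are already dumped or its tweet JSON was never collected, and the retweet API call is only made when that JSON reports a non-zero retweet_count. A rough sketch of the combined decision, reusing the helper names introduced in the diff below (not code from the PR itself):

```python
# Sketch only: in the PR, _should_skip_retweets runs as a pre-filter in collect_retweets,
# while _should_fetch_retweets is checked inside dump_retweets_job.
def should_collect_retweets(tweet, dump_dir):
    if _should_skip_retweets(tweet, dump_dir):       # already dumped, or no cached tweet JSON
        return False
    return _should_fetch_retweets(tweet, dump_dir)   # cached tweet JSON has retweet_count > 0
```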
133 changes: 133 additions & 0 deletions .gitignore
@@ -0,0 +1,133 @@
*.out
fakenewsnet_dataset


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
File renamed without changes.
87 changes: 66 additions & 21 deletions code/retweet_collection.py
@@ -1,33 +1,66 @@
import json
import logging
from twython import TwythonError, TwythonRateLimitError


from tweet_collection import Tweet
from util.TwythonConnector import TwythonConnector
from util.util import create_dir, Config, multiprocess_data_collection
from twython import TwythonError, TwythonRateLimitError

from util.util import DataCollector
from util import Constants


def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
from util.TwythonConnector import TwythonConnector
from util.util import (
Tweet,
Config,
DataCollector,
create_dir,
get_dump_dir,
is_file_exists,
multiprocess_data_collection,
)


def _should_skip_retweets(tweet: Tweet, dump_dir: str):
retweet_filename = "{}/retweets/{}.json".format(dump_dir, tweet.tweet_id)
if is_file_exists(retweet_filename):
return True
tweet_filename = "{}/tweets/{}.json".format(dump_dir, tweet.tweet_id)
if not is_file_exists(tweet_filename):
return True
return False


def _should_fetch_retweets(tweet: Tweet, dump_dir: str):
tweet_filename = "{}/tweets/{}.json".format(dump_dir, tweet.tweet_id)
with open(tweet_filename) as file:
tweet_object = json.load(file)
return tweet_object.get("retweet_count", 0) > 0


def dump_retweets_job(
tweet: Tweet, config: Config, twython_connector: TwythonConnector
):
retweets = []
connection = None
try:
connection = twython_connector.get_twython_connection("get_retweet")
retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

except TwythonRateLimitError:
logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
dump_dir = get_dump_dir(config, tweet)

if _should_fetch_retweets(tweet, dump_dir):
try:
connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

except TwythonRateLimitError:
logging.exception(
"Twython API rate limit exception - tweet id : {}".format(
tweet.tweet_id
)
)

except Exception:
logging.exception(
"Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))
except Exception:
logging.exception(
"Exception in getting retweets for tweet id %d using connection %s"
% (tweet.tweet_id, connection)
)

retweet_obj = {"retweets": retweets}

dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
retweet_dir = "{}/retweets".format(dump_dir)
create_dir(dump_dir)
create_dir(retweet_dir)
@@ -47,15 +80,27 @@ def collect_retweets(news_list, news_source, label, config: Config):
for tweet_id in news.tweet_ids:
tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label))

multiprocess_data_collection(dump_retweets_job, tweet_id_list, (config, config.twython_connector), config)
filtered_tweet_id_list = [
tweet
for tweet in tweet_id_list
if not _should_skip_retweets(tweet, get_dump_dir(config, tweet))
]

multiprocess_data_collection(
dump_retweets_job,
filtered_tweet_id_list,
(config, config.twython_connector),
config,
)

class RetweetCollector(DataCollector):

class RetweetCollector(DataCollector):
def __init__(self, config):
super(RetweetCollector, self).__init__(config)

def collect_data(self, choices):
for choice in choices:
news_list = self.load_news_file(choice)
collect_retweets(news_list, choice["news_source"], choice["label"], self.config)
collect_retweets(
news_list, choice["news_source"], choice["label"], self.config
)
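
For reference, the retweet_count check that _should_fetch_retweets performs can be exercised on its own. The sketch below is illustrative only; the helper name and example path are hypothetical and not part of this PR:

```python
import json
import os

def has_retweets(dump_dir: str, tweet_id: int) -> bool:
    """Return True only if the previously dumped tweet JSON reports any retweets."""
    path = os.path.join(dump_dir, "tweets", "{}.json".format(tweet_id))
    if not os.path.exists(path):
        return False  # tweet object was never collected, so there is nothing to look up
    with open(path) as fh:
        return json.load(fh).get("retweet_count", 0) > 0

# e.g. has_retweets("dumps/politifact/fake/politifact123", 12345)
```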
9 changes: 0 additions & 9 deletions code/tweet_collection.py
@@ -13,15 +13,6 @@
from util.util import equal_chunks


class Tweet:

def __init__(self, tweet_id, news_id, news_source, label):
self.tweet_id = tweet_id
self.news_id = news_id
self.news_source = news_source
self.label = label


def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector):
"""Collect info and dump info of tweet chunk containing atmost 100 tweets"""

48 changes: 36 additions & 12 deletions code/util/util.py
@@ -9,16 +9,24 @@
from util.TwythonConnector import TwythonConnector


class News:
class Tweet:

def __init__(self, tweet_id, news_id, news_source, label):
self.tweet_id = tweet_id
self.news_id = news_id
self.news_source = news_source
self.label = label


class News:
def __init__(self, info_dict, label, news_platform):
self.news_id = info_dict["id"]
self.news_url = info_dict["news_url"]
self.news_title = info_dict["title"]
self.tweet_ids =[]
self.tweet_ids = []

try:
tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
self.tweet_ids = tweets
except:
pass
@@ -27,9 +35,7 @@ def __init__(self, info_dict, label, news_platform):
self.platform = news_platform



class Config:

def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
self.dataset_dir = data_dir
self.dump_location = data_collection_dir
@@ -39,9 +45,7 @@ def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
self.twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)



class DataCollector:

def __init__(self, config):
self.config = config

@@ -60,11 +64,19 @@ def load_news_file(self, data_choice):
maxInt = int(maxInt / 10)

news_list = []
with open('{}/{}_{}.csv'.format(self.config.dataset_dir, data_choice["news_source"],
data_choice["label"]), encoding="UTF-8") as csvfile:
with open(
"{}/{}_{}.csv".format(
self.config.dataset_dir,
data_choice["news_source"],
data_choice["label"],
),
encoding="UTF-8",
) as csvfile:
reader = csv.DictReader(csvfile)
for news in reader:
news_list.append(News(news, data_choice["label"], data_choice["news_source"]))
news_list.append(
News(news, data_choice["label"], data_choice["news_source"])
)

return news_list

@@ -78,15 +90,25 @@ def create_dir(dir_name):
raise


def get_dump_dir(config: Config, tweet: Tweet) -> str:
return "{}/{}/{}/{}".format(
config.dump_location, tweet.news_source, tweet.label, tweet.news_id
)


def is_folder_exists(folder_name):
return os.path.exists(folder_name)


def is_file_exists(file_name):
return os.path.exists(file_name)


def equal_chunks(list, chunk_size):
"""return successive n-sized chunks from l."""
chunks = []
for i in range(0, len(list), chunk_size):
chunks.append(list[i:i + chunk_size])
chunks.append(list[i : i + chunk_size])

return chunks

@@ -101,7 +123,9 @@ def update(arg):
pbar.update()

for i in range(pbar.total):
pool.apply_async(function_reference, args=(data_list[i],)+ args, callback=update)
pool.apply_async(
function_reference, args=(data_list[i],) + args, callback=update
)

pool.close()
pool.join()
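
The directory layout implied by the new get_dump_dir helper is {dump_location}/{news_source}/{label}/{news_id}, with tweets/ and retweets/ subfolders underneath. A small illustration with made-up values (SimpleNamespace stand-ins avoid building a real Config, which constructs a TwythonConnector):

```python
from types import SimpleNamespace

# Hypothetical stand-ins for Config and Tweet, only to show the paths involved.
config = SimpleNamespace(dump_location="dumps")
tweet = SimpleNamespace(news_source="politifact", label="fake",
                        news_id="politifact123", tweet_id=12345)

dump_dir = "{}/{}/{}/{}".format(
    config.dump_location, tweet.news_source, tweet.label, tweet.news_id
)
print(dump_dir)                                                # dumps/politifact/fake/politifact123
print("{}/retweets/{}.json".format(dump_dir, tweet.tweet_id))  # file _should_skip_retweets checks for
```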