Skip retweets for tweets with no retweets #38

Open · wants to merge 4 commits into master
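The change pre-filters the tweet list before the collection workers are spawned: a tweet is skipped when its retweets are already dumped or its tweet JSON was never collected, and the retweet API call is only made when that JSON reports a non-zero retweet_count. A rough sketch of the combined decision, reusing the helper names introduced in the diff below (not code from the PR itself):

```python
# Sketch only: in the PR, _should_skip_retweets runs as a pre-filter in collect_retweets,
# while _should_fetch_retweets is checked inside dump_retweets_job.
def should_collect_retweets(tweet, dump_dir):
    if _should_skip_retweets(tweet, dump_dir):       # already dumped, or no cached tweet JSON
        return False
    return _should_fetch_retweets(tweet, dump_dir)   # cached tweet JSON has retweet_count > 0
```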
133 changes: 133 additions & 0 deletions .gitignore
@@ -0,0 +1,133 @@
*.out
fakenewsnet_dataset


# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
File renamed without changes.
87 changes: 66 additions & 21 deletions code/retweet_collection.py
@@ -1,33 +1,66 @@
import json
import logging
from twython import TwythonError, TwythonRateLimitError


from tweet_collection import Tweet
from util.TwythonConnector import TwythonConnector
from util.util import create_dir, Config, multiprocess_data_collection
from twython import TwythonError, TwythonRateLimitError

from util.util import DataCollector
from util import Constants


def dump_retweets_job(tweet: Tweet, config: Config, twython_connector: TwythonConnector):
from util.TwythonConnector import TwythonConnector
from util.util import (
Tweet,
Config,
DataCollector,
create_dir,
get_dump_dir,
is_file_exists,
multiprocess_data_collection,
)


def _should_skip_retweets(tweet: Tweet, dump_dir: str):
retweet_filename = "{}/retweets/{}.json".format(dump_dir, tweet.tweet_id)
if is_file_exists(retweet_filename):
return True
tweet_filename = "{}/tweets/{}.json".format(dump_dir, tweet.tweet_id)
if not is_file_exists(tweet_filename):
return True
return False


def _should_fetch_retweets(tweet: Tweet, dump_dir: str):
tweet_filename = "{}/tweets/{}.json".format(dump_dir, tweet.tweet_id)
with open(tweet_filename) as file:
tweet_object = json.load(file)
return tweet_object.get("retweet_count", 0) > 0


def dump_retweets_job(
tweet: Tweet, config: Config, twython_connector: TwythonConnector
):
retweets = []
connection = None
try:
connection = twython_connector.get_twython_connection("get_retweet")
retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

except TwythonRateLimitError:
logging.exception("Twython API rate limit exception - tweet id : {}".format(tweet.tweet_id))
dump_dir = get_dump_dir(config, tweet)

if _should_fetch_retweets(tweet, dump_dir):
try:
connection = twython_connector.get_twython_connection(Constants.GET_RETWEET)
retweets = connection.get_retweets(id=tweet.tweet_id, count=100, cursor=-1)

except TwythonRateLimitError:
logging.exception(
"Twython API rate limit exception - tweet id : {}".format(
tweet.tweet_id
)
)

except Exception:
logging.exception(
"Exception in getting retweets for tweet id %d using connection %s" % (tweet.tweet_id, connection))
except Exception:
logging.exception(
"Exception in getting retweets for tweet id %d using connection %s"
% (tweet.tweet_id, connection)
)

retweet_obj = {"retweets": retweets}

dump_dir = "{}/{}/{}/{}".format(config.dump_location, tweet.news_source, tweet.label, tweet.news_id)
retweet_dir = "{}/retweets".format(dump_dir)
create_dir(dump_dir)
create_dir(retweet_dir)
@@ -47,15 +80,27 @@ def collect_retweets(news_list, news_source, label, config: Config):
for tweet_id in news.tweet_ids:
tweet_id_list.append(Tweet(tweet_id, news.news_id, news_source, label))

multiprocess_data_collection(dump_retweets_job, tweet_id_list, (config, config.twython_connector), config)
filtered_tweet_id_list = [
tweet
for tweet in tweet_id_list
if not _should_skip_retweets(tweet, get_dump_dir(config, tweet))
]

multiprocess_data_collection(
dump_retweets_job,
filtered_tweet_id_list,
(config, config.twython_connector),
config,
)

class RetweetCollector(DataCollector):

class RetweetCollector(DataCollector):
def __init__(self, config):
super(RetweetCollector, self).__init__(config)

def collect_data(self, choices):
for choice in choices:
news_list = self.load_news_file(choice)
collect_retweets(news_list, choice["news_source"], choice["label"], self.config)
collect_retweets(
news_list, choice["news_source"], choice["label"], self.config
)
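
For reference, the retweet_count check that _should_fetch_retweets performs can be exercised on its own. The sketch below is illustrative only; the helper name and example path are hypothetical and not part of this PR:

```python
import json
import os

def has_retweets(dump_dir: str, tweet_id: int) -> bool:
    """Return True only if the previously dumped tweet JSON reports any retweets."""
    path = os.path.join(dump_dir, "tweets", "{}.json".format(tweet_id))
    if not os.path.exists(path):
        return False  # tweet object was never collected, so there is nothing to look up
    with open(path) as fh:
        return json.load(fh).get("retweet_count", 0) > 0

# e.g. has_retweets("dumps/politifact/fake/politifact123", 12345)
```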
9 changes: 0 additions & 9 deletions code/tweet_collection.py
@@ -13,15 +13,6 @@
from util.util import equal_chunks


class Tweet:

def __init__(self, tweet_id, news_id, news_source, label):
self.tweet_id = tweet_id
self.news_id = news_id
self.news_source = news_source
self.label = label


def dump_tweet_information(tweet_chunk: list, config: Config, twython_connector: TwythonConnector):
"""Collect info and dump info of tweet chunk containing atmost 100 tweets"""

48 changes: 36 additions & 12 deletions code/util/util.py
@@ -9,16 +9,24 @@
from util.TwythonConnector import TwythonConnector


class News:
class Tweet:

def __init__(self, tweet_id, news_id, news_source, label):
self.tweet_id = tweet_id
self.news_id = news_id
self.news_source = news_source
self.label = label


class News:
def __init__(self, info_dict, label, news_platform):
self.news_id = info_dict["id"]
self.news_url = info_dict["news_url"]
self.news_title = info_dict["title"]
self.tweet_ids =[]
self.tweet_ids = []

try:
tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
tweets = [int(tweet_id) for tweet_id in info_dict["tweet_ids"].split("\t")]
self.tweet_ids = tweets
except:
pass
@@ -27,9 +35,7 @@ def __init__(self, info_dict, label, news_platform):
self.platform = news_platform



class Config:

def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
self.dataset_dir = data_dir
self.dump_location = data_collection_dir
@@ -39,9 +45,7 @@ def __init__(self, data_dir, data_collection_dir, tweet_keys_file, num_process):
self.twython_connector = TwythonConnector("localhost:5000", tweet_keys_file)



class DataCollector:

def __init__(self, config):
self.config = config

@@ -60,11 +64,19 @@ def load_news_file(self, data_choice):
maxInt = int(maxInt / 10)

news_list = []
with open('{}/{}_{}.csv'.format(self.config.dataset_dir, data_choice["news_source"],
data_choice["label"]), encoding="UTF-8") as csvfile:
with open(
"{}/{}_{}.csv".format(
self.config.dataset_dir,
data_choice["news_source"],
data_choice["label"],
),
encoding="UTF-8",
) as csvfile:
reader = csv.DictReader(csvfile)
for news in reader:
news_list.append(News(news, data_choice["label"], data_choice["news_source"]))
news_list.append(
News(news, data_choice["label"], data_choice["news_source"])
)

return news_list

@@ -78,15 +90,25 @@ def create_dir(dir_name):
raise


def get_dump_dir(config: Config, tweet: Tweet) -> str:
return "{}/{}/{}/{}".format(
config.dump_location, tweet.news_source, tweet.label, tweet.news_id
)


def is_folder_exists(folder_name):
return os.path.exists(folder_name)


def is_file_exists(file_name):
return os.path.exists(file_name)


def equal_chunks(list, chunk_size):
"""return successive n-sized chunks from l."""
chunks = []
for i in range(0, len(list), chunk_size):
chunks.append(list[i:i + chunk_size])
chunks.append(list[i : i + chunk_size])

return chunks

@@ -101,7 +123,9 @@ def update(arg):
pbar.update()

for i in range(pbar.total):
pool.apply_async(function_reference, args=(data_list[i],)+ args, callback=update)
pool.apply_async(
function_reference, args=(data_list[i],) + args, callback=update
)

pool.close()
pool.join()
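
The directory layout implied by the new get_dump_dir helper is {dump_location}/{news_source}/{label}/{news_id}, with tweets/ and retweets/ subfolders underneath. A small illustration with made-up values (SimpleNamespace stand-ins avoid building a real Config, which constructs a TwythonConnector):

```python
from types import SimpleNamespace

# Hypothetical stand-ins for Config and Tweet, only to show the paths involved.
config = SimpleNamespace(dump_location="dumps")
tweet = SimpleNamespace(news_source="politifact", label="fake",
                        news_id="politifact123", tweet_id=12345)

dump_dir = "{}/{}/{}/{}".format(
    config.dump_location, tweet.news_source, tweet.label, tweet.news_id
)
print(dump_dir)                                                # dumps/politifact/fake/politifact123
print("{}/retweets/{}.json".format(dump_dir, tweet.tweet_id))  # file _should_skip_retweets checks for
```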