From c7a2598e1a4b963ff6990b6e7032af6434e0bcb4 Mon Sep 17 00:00:00 2001 From: Mincka Date: Sat, 30 Sep 2017 11:56:43 +0200 Subject: [PATCH] Fixes #27 Adds support for Twitter's paginate feature Adds raw output for authentication and thread list retrieval --- README.md | 13 +++---------- dmarchiver/cmdline.py | 15 +++++++++------ dmarchiver/core.py | 43 +++++++++++++++++++++++++++++++++++++------ 3 files changed, 49 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index e699d3b..931cd7d 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ By running the tool without any argument, you will be only prompted for your use Download a Windows build from the [project releases](https://github.com/Mincka/DMArchiver/releases). -Unzip the archive in a temporary folder and double-click the executable or run it in a Command Prompt: +Unzip the archive in a temporary folder and double-click the executable or run it in a Command Prompt (mandatory if you want to use parameters to download images and videos): ``` > C:\Temp\DMArchiver.exe ``` @@ -49,7 +49,7 @@ Note: If you run the tool directly from the zip archive window, it may fail when Download a macOS build from the [project releases](https://github.com/Mincka/DMArchiver/releases). -Then click on the executable, or run Terminal and execute the following commands: +Then click on the executable, or run Terminal and execute the following commands (mandatory if you want to use parameters to download images and videos): ``` $ cd Downloads $ ./dmarchiver @@ -143,14 +143,6 @@ You can also specify the username and the password in the options. Because DMArc $ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password ``` -### Module import -```python ->>> from dmarchiver.core import Crawler ->>> crawler = Crawler() ->>> crawler.authenticate('username', 'password') ->>> crawler.crawl('conversation_id') -``` - ## Development ### Ubuntu / Windows @@ -161,6 +153,7 @@ $ cd DMArchiver $ virtualenv venv $ source venv/bin/activate # "venv/Scripts/Activate.bat" on Windows $ pip install -r requirements.txt +$ python -m dmarchiver.cmdline ``` ### Mac OS X / macOS diff --git a/dmarchiver/cmdline.py b/dmarchiver/cmdline.py index ad9a25b..ceaf8a2 100644 --- a/dmarchiver/cmdline.py +++ b/dmarchiver/cmdline.py @@ -24,9 +24,12 @@ import argparse import getpass import sys -from dmarchiver import __version__ -from dmarchiver.core import Crawler - +if __name__ == '__main__': + from dmarchiver import __version__ + from dmarchiver.core import Crawler +else: + from .__init__ import __version__ + from .core import Crawler def main(): print("DMArchiver {0}".format(__version__)) @@ -72,7 +75,7 @@ def main(): crawler = Crawler() try: - crawler.authenticate(username, password) + crawler.authenticate(username, password, args.raw_output) except PermissionError as err: print('Error: {0}'.format(err.args[0])) print('Exiting.') @@ -90,10 +93,10 @@ def main(): crawler.crawl( conversation_id, args.download_images, - args.download_gifs, args.raw_output) + args.download_gifs, args.download_videos, args.raw_output) else: print('Conversation ID not specified. Retrieving all the threads.') - threads = crawler.get_threads() + threads = crawler.get_threads(args.raw_output) print('{0} thread(s) found.'.format(len(threads))) for thread_id in threads: diff --git a/dmarchiver/core.py b/dmarchiver/core.py index e194ad9..55cbb71 100644 --- a/dmarchiver/core.py +++ b/dmarchiver/core.py @@ -259,16 +259,24 @@ class Crawler(object): _max_id_found = False - def authenticate(self, username, password): + def authenticate(self, username, password, raw_output): login_url = self._twitter_base_url + '/login' sessions_url = self._twitter_base_url + '/sessions' self._session = requests.Session() + if raw_output: + raw_output_file = open( + 'authentication-{0}.txt'.format(username), 'wb') + response = self._session.get( login_url, headers=self._http_headers) + if raw_output: + raw_output_file.write(response.content) + raw_output_file.close() + document = lxml.html.document_fromstring(response.text) authenticity_token = document.xpath( '//input[@name="authenticity_token"]/@value')[0] @@ -285,26 +293,49 @@ def authenticate(self, username, password): if 'auth_token' in cookies: print('Authentication succeedeed.{0}'.format(os.linesep)) else: - raise PermissionError('Your username or password was invalid.') + raise PermissionError('Your username or password was invalid. Note: DMArchiver does not support multi-factor authentication or application passwords.') - def get_threads(self): + def get_threads(self, raw_output): threads = [] messages_url = self._twitter_base_url + '/messages' payload = {} + first_request = False + if raw_output: + raw_output_file = open( + 'conversation-list.txt', 'wb') while True: response = self._session.get( messages_url, headers=self._ajax_headers, params=payload) + + if raw_output: + raw_output_file.write(response.content) json = response.json() - threads += json['inner']['trusted']['threads'] - if json['inner']['trusted']['has_more'] == False: + try: + if first_request == False: + first_request = True + threads += json['inner']['trusted']['threads'] + + if json['inner']['trusted']['has_more'] == False: + break + else: + threads += json['trusted']['threads'] + + if json['trusted']['has_more'] == False: + break + + payload = {'is_trusted': 'true', 'max_entry_id': json['inner']['trusted']['min_entry_id']} + messages_url = self._twitter_base_url + '/inbox/paginate?is_trusted=true&max_entry_id=' + json['inner']['trusted']['min_entry_id'] + except KeyError as e: + print('Unable to parse the list of the conversations. Maybe your account is locked or Twitter has updated the HTML code. Use -r to get the raw output and post an issue on GitHub. Exception: {0}'.format(str(e))) break - payload = {'max_entry_id': json['inner']['trusted']['min_entry_id']} + if raw_output: + raw_output_file.close() return threads