This repository has been archived by the owner on Feb 28, 2023. It is now read-only.

Fixes #27
Adds support for Twitter's paginate feature
Adds raw output for authentication and thread list retrieval
Mincka committed Sep 30, 2017
1 parent 8a29362 commit c7a2598
Showing 3 changed files with 49 additions and 22 deletions.
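
Taken together, the changes let the tool save the raw Twitter responses and download videos in addition to images and GIFs. A typical invocation combining the new behaviour, assuming the `-dv` (download videos) flag from the updated README and the `-r` (raw output) flag referenced in `core.py`:
```
$ dmarchiver -id "conversation_id" -di -dg -dv -r -u your_username -p your_password
```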
README.md (13 changes: 3 additions & 10 deletions)
@@ -38,7 +38,7 @@ By running the tool without any argument, you will be only prompted for your use

Download a Windows build from the [project releases](https://github.com/Mincka/DMArchiver/releases).

Unzip the archive in a temporary folder and double-click the executable or run it in a Command Prompt:
Unzip the archive in a temporary folder and double-click the executable or run it in a Command Prompt (mandatory if you want to use parameters to download images and videos):
```
> C:\Temp\DMArchiver.exe
```
@@ -49,7 +49,7 @@ Note: If you run the tool directly from the zip archive window, it may fail when

Download a macOS build from the [project releases](https://github.com/Mincka/DMArchiver/releases).

Then click on the executable, or run Terminal and execute the following commands:
Then click on the executable, or run Terminal and execute the following commands (mandatory if you want to use parameters to download images and videos):
```
$ cd Downloads
$ ./dmarchiver
@@ -143,14 +143,6 @@ You can also specify the username and the password in the options. Because DMArc
$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password
```

### Module import
```python
>>> from dmarchiver.core import Crawler
>>> crawler = Crawler()
>>> crawler.authenticate('username', 'password')
>>> crawler.crawl('conversation_id')
```

## Development

### Ubuntu / Windows
@@ -161,6 +153,7 @@ $ cd DMArchiver
$ virtualenv venv
$ source venv/bin/activate # "venv/Scripts/Activate.bat" on Windows
$ pip install -r requirements.txt
$ python -m dmarchiver.cmdline
```

### Mac OS X / macOS
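This commit also drops the README's module-import example. Judging from the updated calls in `cmdline.py` below, equivalent programmatic use of `Crawler` would now pass the extra arguments explicitly; a minimal sketch with the argument order inferred from the diff:
```python
>>> from dmarchiver.core import Crawler
>>> crawler = Crawler()
>>> # authenticate(username, password, raw_output)
>>> crawler.authenticate('username', 'password', False)
>>> # crawl(conversation_id, download_images, download_gifs, download_videos, raw_output)
>>> crawler.crawl('conversation_id', True, True, True, False)
```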
dmarchiver/cmdline.py (15 changes: 9 additions & 6 deletions)
@@ -24,9 +24,12 @@
import argparse
import getpass
import sys
from dmarchiver import __version__
from dmarchiver.core import Crawler

if __name__ == '__main__':
from dmarchiver import __version__
from dmarchiver.core import Crawler
else:
from .__init__ import __version__
from .core import Crawler

def main():
print("DMArchiver {0}".format(__version__))
@@ -72,7 +75,7 @@ def main():

crawler = Crawler()
try:
crawler.authenticate(username, password)
crawler.authenticate(username, password, args.raw_output)
except PermissionError as err:
print('Error: {0}'.format(err.args[0]))
print('Exiting.')
@@ -90,10 +93,10 @@ def main():
crawler.crawl(
conversation_id,
args.download_images,
args.download_gifs, args.raw_output)
args.download_gifs, args.download_videos, args.raw_output)
else:
print('Conversation ID not specified. Retrieving all the threads.')
threads = crawler.get_threads()
threads = crawler.get_threads(args.raw_output)
print('{0} thread(s) found.'.format(len(threads)))

for thread_id in threads:
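The `args.download_videos` and `args.raw_output` values used above imply two new command-line options whose definitions fall outside the hunks shown here. A hypothetical `argparse` sketch, with the flag names `-dv` and `-r` assumed from the README and the error message in `core.py`:
```python
import argparse

# Illustrative only; the real option definitions live in the collapsed part of main().
parser = argparse.ArgumentParser(description='DMArchiver')
parser.add_argument('-dv', '--download-videos', dest='download_videos',
                    action='store_true',
                    help='Download the videos of the conversation.')
parser.add_argument('-r', '--raw-output', dest='raw_output',
                    action='store_true',
                    help='Write the raw server responses to text files.')
args = parser.parse_args()
```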
dmarchiver/core.py (43 changes: 37 additions & 6 deletions)
@@ -259,16 +259,24 @@ class Crawler(object):

_max_id_found = False

def authenticate(self, username, password):
def authenticate(self, username, password, raw_output):
login_url = self._twitter_base_url + '/login'
sessions_url = self._twitter_base_url + '/sessions'

self._session = requests.Session()

if raw_output:
raw_output_file = open(
'authentication-{0}.txt'.format(username), 'wb')

response = self._session.get(
login_url,
headers=self._http_headers)

if raw_output:
raw_output_file.write(response.content)
raw_output_file.close()

document = lxml.html.document_fromstring(response.text)
authenticity_token = document.xpath(
'//input[@name="authenticity_token"]/@value')[0]
@@ -285,26 +293,49 @@ def authenticate(self, username, password):
if 'auth_token' in cookies:
print('Authentication succeeded.{0}'.format(os.linesep))
else:
raise PermissionError('Your username or password was invalid.')
raise PermissionError('Your username or password was invalid. Note: DMArchiver does not support multi-factor authentication or application passwords.')

def get_threads(self):
def get_threads(self, raw_output):
threads = []
messages_url = self._twitter_base_url + '/messages'
payload = {}
first_request = False
if raw_output:
raw_output_file = open(
'conversation-list.txt', 'wb')

while True:
response = self._session.get(
messages_url,
headers=self._ajax_headers,
params=payload)

if raw_output:
raw_output_file.write(response.content)

json = response.json()
threads += json['inner']['trusted']['threads']

if json['inner']['trusted']['has_more'] == False:
try:
if first_request == False:
first_request = True
threads += json['inner']['trusted']['threads']

if json['inner']['trusted']['has_more'] == False:
break
else:
threads += json['trusted']['threads']

if json['trusted']['has_more'] == False:
break

payload = {'is_trusted': 'true', 'max_entry_id': json['inner']['trusted']['min_entry_id']}
messages_url = self._twitter_base_url + '/inbox/paginate?is_trusted=true&max_entry_id=' + json['inner']['trusted']['min_entry_id']
except KeyError as e:
print('Unable to parse the list of the conversations. Maybe your account is locked or Twitter has updated the HTML code. Use -r to get the raw output and post an issue on GitHub. Exception: {0}'.format(str(e)))
break

payload = {'max_entry_id': json['inner']['trusted']['min_entry_id']}
if raw_output:
raw_output_file.close()

return threads

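The pagination support works by fetching `/messages` once and then following `/inbox/paginate` with the previous page's `min_entry_id` until `has_more` is false; the first response nests the inbox under `inner`, while the paginated responses apparently do not, hence the `first_request` branch. A simplified standalone sketch of that loop, reusing the endpoints and JSON keys from the code above (session and headers assumed to be set up as in `Crawler`):
```python
import requests

def list_threads(session: requests.Session, base_url: str, ajax_headers: dict) -> list:
    """Collect thread ids page by page, mirroring the logic of Crawler.get_threads()."""
    threads = []
    url = base_url + '/messages'
    first_request = True
    while True:
        data = session.get(url, headers=ajax_headers).json()
        # Only the first response wraps the inbox in an 'inner' object.
        trusted = data['inner']['trusted'] if first_request else data['trusted']
        first_request = False
        threads += trusted['threads']
        if not trusted['has_more']:
            break
        url = (base_url + '/inbox/paginate?is_trusted=true&max_entry_id='
               + str(trusted['min_entry_id']))
    return threads
```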
