This repository has been archived by the owner on Feb 28, 2023. It is now read-only.

Fixes #27
Adds support for Twitter's paginate feature
Adds raw output for authentication and thread list retrieval
Mincka committed Sep 30, 2017
1 parent 8a29362 commit c7a2598
Showing 3 changed files with 49 additions and 22 deletions.
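
Taken together, the changes let the tool save the raw Twitter responses and download videos in addition to images and GIFs. A typical invocation combining the new behaviour, assuming the `-dv` (download videos) flag from the updated README and the `-r` (raw output) flag referenced in `core.py`:
```
$ dmarchiver -id "conversation_id" -di -dg -dv -r -u your_username -p your_password
```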
README.md (13 changes: 3 additions & 10 deletions)
@@ -38,7 +38,7 @@ By running the tool without any argument, you will be only prompted for your use

Download a Windows build from the [project releases](https://github.com/Mincka/DMArchiver/releases).

Unzip the archive in a temporary folder and double-click the executable or run it in a Command Prompt:
Unzip the archive in a temporary folder and double-click the executable or run it in a Command Prompt (mandatory if you want to use parameters to download images and videos):
```
> C:\Temp\DMArchiver.exe
```
@@ -49,7 +49,7 @@ Note: If you run the tool directly from the zip archive window, it may fail when

Download a macOS build from the [project releases](https://github.com/Mincka/DMArchiver/releases).

Then click on the executable, or run Terminal and execute the following commands:
Then click on the executable, or run Terminal and execute the following commands (mandatory if you want to use parameters to download images and videos):
```
$ cd Downloads
$ ./dmarchiver
@@ -143,14 +143,6 @@ You can also specify the username and the password in the options. Because DMArc
$ dmarchiver -id "conversation_id" -di -dg -dv -u your_username -p your_password
```

### Module import
```python
>>> from dmarchiver.core import Crawler
>>> crawler = Crawler()
>>> crawler.authenticate('username', 'password')
>>> crawler.crawl('conversation_id')
```

## Development

### Ubuntu / Windows
@@ -161,6 +153,7 @@ $ cd DMArchiver
$ virtualenv venv
$ source venv/bin/activate # "venv/Scripts/Activate.bat" on Windows
$ pip install -r requirements.txt
$ python -m dmarchiver.cmdline
```

### Mac OS X / macOS
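This commit also drops the README's module-import example. Judging from the updated calls in `cmdline.py` below, equivalent programmatic use of `Crawler` would now pass the extra arguments explicitly; a minimal sketch with the argument order inferred from the diff:
```python
>>> from dmarchiver.core import Crawler
>>> crawler = Crawler()
>>> # authenticate(username, password, raw_output)
>>> crawler.authenticate('username', 'password', False)
>>> # crawl(conversation_id, download_images, download_gifs, download_videos, raw_output)
>>> crawler.crawl('conversation_id', True, True, True, False)
```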
dmarchiver/cmdline.py (15 changes: 9 additions & 6 deletions)
@@ -24,9 +24,12 @@
import argparse
import getpass
import sys
from dmarchiver import __version__
from dmarchiver.core import Crawler

if __name__ == '__main__':
from dmarchiver import __version__
from dmarchiver.core import Crawler
else:
from .__init__ import __version__
from .core import Crawler

def main():
print("DMArchiver {0}".format(__version__))
@@ -72,7 +75,7 @@ def main():

crawler = Crawler()
try:
crawler.authenticate(username, password)
crawler.authenticate(username, password, args.raw_output)
except PermissionError as err:
print('Error: {0}'.format(err.args[0]))
print('Exiting.')
@@ -90,10 +93,10 @@ def main():
crawler.crawl(
conversation_id,
args.download_images,
args.download_gifs, args.raw_output)
args.download_gifs, args.download_videos, args.raw_output)
else:
print('Conversation ID not specified. Retrieving all the threads.')
threads = crawler.get_threads()
threads = crawler.get_threads(args.raw_output)
print('{0} thread(s) found.'.format(len(threads)))

for thread_id in threads:
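The `args.download_videos` and `args.raw_output` values used above imply two new command-line options whose definitions fall outside the hunks shown here. A hypothetical `argparse` sketch, with the flag names `-dv` and `-r` assumed from the README and the error message in `core.py`:
```python
import argparse

# Illustrative only; the real option definitions live in the collapsed part of main().
parser = argparse.ArgumentParser(description='DMArchiver')
parser.add_argument('-dv', '--download-videos', dest='download_videos',
                    action='store_true',
                    help='Download the videos of the conversation.')
parser.add_argument('-r', '--raw-output', dest='raw_output',
                    action='store_true',
                    help='Write the raw server responses to text files.')
args = parser.parse_args()
```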
dmarchiver/core.py (43 changes: 37 additions & 6 deletions)
@@ -259,16 +259,24 @@ class Crawler(object):

_max_id_found = False

def authenticate(self, username, password):
def authenticate(self, username, password, raw_output):
login_url = self._twitter_base_url + '/login'
sessions_url = self._twitter_base_url + '/sessions'

self._session = requests.Session()

if raw_output:
raw_output_file = open(
'authentication-{0}.txt'.format(username), 'wb')

response = self._session.get(
login_url,
headers=self._http_headers)

if raw_output:
raw_output_file.write(response.content)
raw_output_file.close()

document = lxml.html.document_fromstring(response.text)
authenticity_token = document.xpath(
'//input[@name="authenticity_token"]/@value')[0]
@@ -285,26 +293,49 @@ def authenticate(self, username, password):
if 'auth_token' in cookies:
print('Authentication succeeded.{0}'.format(os.linesep))
else:
raise PermissionError('Your username or password was invalid.')
raise PermissionError('Your username or password was invalid. Note: DMArchiver does not support multi-factor authentication or application passwords.')

def get_threads(self):
def get_threads(self, raw_output):
threads = []
messages_url = self._twitter_base_url + '/messages'
payload = {}
first_request = False
if raw_output:
raw_output_file = open(
'conversation-list.txt', 'wb')

while True:
response = self._session.get(
messages_url,
headers=self._ajax_headers,
params=payload)

if raw_output:
raw_output_file.write(response.content)

json = response.json()
threads += json['inner']['trusted']['threads']

if json['inner']['trusted']['has_more'] == False:
try:
if first_request == False:
first_request = True
threads += json['inner']['trusted']['threads']

if json['inner']['trusted']['has_more'] == False:
break
else:
threads += json['trusted']['threads']

if json['trusted']['has_more'] == False:
break

payload = {'is_trusted': 'true', 'max_entry_id': json['inner']['trusted']['min_entry_id']}
messages_url = self._twitter_base_url + '/inbox/paginate?is_trusted=true&max_entry_id=' + json['inner']['trusted']['min_entry_id']
except KeyError as e:
print('Unable to parse the list of the conversations. Maybe your account is locked or Twitter has updated the HTML code. Use -r to get the raw output and post an issue on GitHub. Exception: {0}'.format(str(e)))
break

payload = {'max_entry_id': json['inner']['trusted']['min_entry_id']}
if raw_output:
raw_output_file.close()

return threads

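The pagination support works by fetching `/messages` once and then following `/inbox/paginate` with the previous page's `min_entry_id` until `has_more` is false; the first response nests the inbox under `inner`, while the paginated responses apparently do not, hence the `first_request` branch. A simplified standalone sketch of that loop, reusing the endpoints and JSON keys from the code above (session and headers assumed to be set up as in `Crawler`):
```python
import requests

def list_threads(session: requests.Session, base_url: str, ajax_headers: dict) -> list:
    """Collect thread ids page by page, mirroring the logic of Crawler.get_threads()."""
    threads = []
    url = base_url + '/messages'
    first_request = True
    while True:
        data = session.get(url, headers=ajax_headers).json()
        # Only the first response wraps the inbox in an 'inner' object.
        trusted = data['inner']['trusted'] if first_request else data['trusted']
        first_request = False
        threads += trusted['threads']
        if not trusted['has_more']:
            break
        url = (base_url + '/inbox/paginate?is_trusted=true&max_entry_id='
               + str(trusted['min_entry_id']))
    return threads
```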
