Skip to content

Add the max_messages parameter to limit parsing and responses #1

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions tchan.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -121,10 +121,13 @@ def parse_info(tree):
)


def parse_messages(original_url, tree):
"Retrieve messages from HTML tree"
def parse_messages(original_url, tree, max_messages=None):
"Retrieve messages from HTML tree, with optional max_messages limit"
messages = tree.xpath("//div[contains(@class, 'tgme_widget_message_wrap')]")
count = 0
for message in reversed(messages):
if max_messages is not None and count >= max_messages:
break
if message.xpath(".//div[contains(@class, 'tme_no_messages_found')]"):
# XXX: this case may happen because a great number of requests was
# made and Telegram sent this response as if there were no new
Expand Down Expand Up @@ -381,6 +384,7 @@ def parse_messages(original_url, tree):
forwarded_author=forwarded_author,
forwarded_author_url=forwarded_author_url,
)
count += 1


class ChannelScraper:
Expand All @@ -394,17 +398,27 @@ def info(self, username_or_url):
tree = document_fromstring(response.text)
return parse_info(tree)

def messages(self, username_or_url):
"Get messages from a channel, paginating until it ends"
def messages(self, username_or_url, max_messages=None):
"Get messages from a channel, paginating until it ends or max_messages is reached"
url = normalize_url(username_or_url)

last_captured_id = None
total_count = 0
while True:
response = self.session.get(url)
tree = document_fromstring(response.text)
for message in parse_messages(url, tree):
# Calculate how many messages to fetch in this page
page_limit = None
if max_messages is not None:
page_limit = max_messages - total_count
if page_limit <= 0:
break
for message in parse_messages(url, tree, max_messages=page_limit):
last_captured_id = message.id
yield message
total_count += 1
if max_messages is not None and total_count >= max_messages:
return
next_page_url = tree.xpath("//link[@rel = 'prev']/@href")
if not next_page_url:
if last_captured_id is not None and message.id > 20:
Expand Down