Skip to content

Commit a01cfe6

Browse files
committed
Version 0.1.2
Check the changelog in releases
1 parent f0a21f6 commit a01cfe6

File tree

7 files changed

+25
-7
lines changed

7 files changed

+25
-7
lines changed

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
include LICENSE
2+
include *.db
3+
include scrapling/*.db
24
include scrapling/py.typed
35

46
recursive-exclude * __pycache__

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
2-
[![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![License](https://img.shields.io/badge/License-BSD--3-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
2+
[![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
33

44
Dealing with failing web scrapers due to website changes? Meet Scrapling.
55

@@ -415,6 +415,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
415415
### Is Scrapling thread-safe?
416416
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
417417
418+
## Sponsors
419+
[![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
420+
418421
## Contributing
419422
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
420423

images/CapSolver.png

173 KB
Loading

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from scrapling.custom_types import TextHandler, AttributesHandler
44

55
__author__ = "Karim Shoair ([email protected])"
6-
__version__ = "0.1.1"
6+
__version__ = "0.1.2"
77
__copyright__ = "Copyright (c) 2024 Karim Shoair"
88

99

scrapling/parser.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def __init__(
7878

7979
parser = html.HTMLParser(
8080
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
81-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is True), encoding=encoding,
81+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
8282
compact=True, huge_tree=huge_tree, default_doctype=True
8383
)
8484
self._root = etree.fromstring(body, parser=parser, base_url=url)
@@ -142,7 +142,8 @@ def __get_correct_result(
142142
if issubclass(type(element), html.HtmlMixin):
143143
return self.__class__(
144144
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
145-
keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
145+
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
146+
huge_tree=self.__huge_tree_enabled, debug=self.__debug
146147
)
147148
return element
148149

@@ -186,7 +187,19 @@ def tag(self) -> str:
186187
def text(self) -> TextHandler:
187188
"""Get text content of the element"""
188189
if not self.__text:
189-
self.__text = TextHandler(self._root.text)
190+
if self.__keep_comments:
191+
# If the user chose to keep comments, remove comments from text
192+
# Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
193+
# This issue is present in parsel/scrapy as well, so there is no need to repeat it here; this way the user can run regex on the full text.
194+
code = self.html_content
195+
parser = html.HTMLParser(
196+
recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
197+
compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
198+
)
199+
fragment_root = html.fragment_fromstring(code, parser=parser)
200+
self.__text = TextHandler(fragment_root.text)
201+
else:
202+
self.__text = TextHandler(self._root.text)
190203
return self.__text
191204

192205
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = scrapling
3-
version = 0.1.1
3+
version = 0.1.2
44
author = Karim Shoair
55
author_email = [email protected]
66
description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="scrapling",
9-
version="0.1.1",
9+
version="0.1.2",
1010
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
1111
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
1212
impressive speed improvements over many popular scraping tools.""",

0 commit comments

Comments
 (0)