Skip to content

Commit a01cfe6

Browse files
committed
Version 0.1.2
Check the changelog in releases
1 parent f0a21f6 commit a01cfe6

File tree

7 files changed

+25
-7
lines changed

7 files changed

+25
-7
lines changed

MANIFEST.in

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
include LICENSE
2+
include *.db
3+
include scrapling/*.db
24
include scrapling/py.typed
35

46
recursive-exclude * __pycache__

README.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# 🕷️ Scrapling: Lightning-Fast, Adaptive Web Scraping for Python
2-
[![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![License](https://img.shields.io/badge/License-BSD--3-blue.svg)](https://opensource.org/licenses/BSD-3-Clause)
2+
[![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
33

44
Dealing with failing web scrapers due to website changes? Meet Scrapling.
55

@@ -415,6 +415,9 @@ Of course, you can find elements by text/regex, find similar elements in a more
415415
### Is Scrapling thread-safe?
416416
Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its own state.
417417
418+
## Sponsors
419+
[![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
420+
418421
## Contributing
419422
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
420423

images/CapSolver.png

173 KB
Loading

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from scrapling.custom_types import TextHandler, AttributesHandler
44

55
__author__ = "Karim Shoair ([email protected])"
6-
__version__ = "0.1.1"
6+
__version__ = "0.1.2"
77
__copyright__ = "Copyright (c) 2024 Karim Shoair"
88

99

scrapling/parser.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ def __init__(
7878

7979
parser = html.HTMLParser(
8080
# https://lxml.de/api/lxml.etree.HTMLParser-class.html
81-
recover=True, remove_blank_text=True, remove_comments=(keep_comments is True), encoding=encoding,
81+
recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
8282
compact=True, huge_tree=huge_tree, default_doctype=True
8383
)
8484
self._root = etree.fromstring(body, parser=parser, base_url=url)
@@ -142,7 +142,8 @@ def __get_correct_result(
142142
if issubclass(type(element), html.HtmlMixin):
143143
return self.__class__(
144144
root=element, url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
145-
keep_comments=self.__keep_comments, huge_tree=self.__huge_tree_enabled, debug=self.__debug
145+
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
146+
huge_tree=self.__huge_tree_enabled, debug=self.__debug
146147
)
147148
return element
148149

@@ -186,7 +187,19 @@ def tag(self) -> str:
186187
def text(self) -> TextHandler:
187188
"""Get text content of the element"""
188189
if not self.__text:
189-
self.__text = TextHandler(self._root.text)
190+
if self.__keep_comments:
191+
# If the user chose to keep comments, remove comments from text
192+
# Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
193+
# This issue is present in parsel/scrapy as well, so there is no need to repeat it here; this way the user can run regex on the full text.
194+
code = self.html_content
195+
parser = html.HTMLParser(
196+
recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
197+
compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
198+
)
199+
fragment_root = html.fragment_fromstring(code, parser=parser)
200+
self.__text = TextHandler(fragment_root.text)
201+
else:
202+
self.__text = TextHandler(self._root.text)
190203
return self.__text
191204

192205
def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[metadata]
22
name = scrapling
3-
version = 0.1.1
3+
version = 0.1.2
44
author = Karim Shoair
55
author_email = [email protected]
66
description = Scrapling is a powerful, flexible, adaptive, and high-performance web scraping library for Python.

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
setup(
88
name="scrapling",
9-
version="0.1.1",
9+
version="0.1.2",
1010
description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
1111
simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
1212
impressive speed improvements over many popular scraping tools.""",

0 commit comments

Comments
 (0)