Skip to content

Commit

Permalink
Sorting all library imports with isort
Browse files Browse the repository at this point in the history
  • Loading branch information
D4Vinci committed Nov 30, 2024
1 parent 26bfa88 commit 1a17b2c
Show file tree
Hide file tree
Showing 24 changed files with 108 additions and 113 deletions.
15 changes: 8 additions & 7 deletions benchmarks.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import functools
import time
import timeit
import functools
import requests
from statistics import mean

from scrapling import Adaptor
from parsel import Selector
from lxml import etree, html
import requests
from autoscraper import AutoScraper
from bs4 import BeautifulSoup
from lxml import etree, html
from mechanicalsoup import StatefulBrowser
from parsel import Selector
from pyquery import PyQuery as pq
from autoscraper import AutoScraper
from selectolax.parser import HTMLParser
from mechanicalsoup import StatefulBrowser

from scrapling import Adaptor

large_html = '<html><body>' + '<div class="item">' * 5000 + '</div>' * 5000 + '</body></html>'

Expand Down
2 changes: 1 addition & 1 deletion docs/Examples/selectorless_stackoverflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

import requests

from scrapling import Adaptor

response = requests.get('https://stackoverflow.com/questions/tagged/web-scraping?sort=MostVotes&filters=NoAcceptedAnswer&edited=true&pagesize=50&page=2')
Expand All @@ -22,4 +23,3 @@
# We will get all the rest of the titles/authors in the page depending on the first title and the first author we got above as a starting point
for i, (title, author) in enumerate(zip(first_question_title.find_similar(), first_question_author.find_similar()), start=1):
print(i, title.text, author.text)

5 changes: 3 additions & 2 deletions scrapling/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
# Declare top-level shortcuts
from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
from scrapling.core.custom_types import AttributesHandler, TextHandler
from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
StealthyFetcher)
from scrapling.parser import Adaptor, Adaptors
from scrapling.core.custom_types import TextHandler, AttributesHandler

__author__ = "Karim Shoair ([email protected])"
__version__ = "0.2.7"
Expand Down
5 changes: 2 additions & 3 deletions scrapling/core/_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
Type definitions for type checking purposes.
"""

from typing import (
Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
)
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
List, Literal, Optional, Pattern, Tuple, Type, Union)

try:
from typing import Protocol
Expand Down
10 changes: 5 additions & 5 deletions scrapling/core/custom_types.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import re
from types import MappingProxyType
from collections.abc import Mapping
from types import MappingProxyType

from scrapling.core.utils import _is_iterable, flatten
from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex

from orjson import loads, dumps
from orjson import dumps, loads
from w3lib.html import replace_entities as _replace_entities

from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
from scrapling.core.utils import _is_iterable, flatten


class TextHandler(str):
"""Extends standard Python string by adding more functionality"""
Expand Down
12 changes: 6 additions & 6 deletions scrapling/core/storage_adaptors.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,16 @@
import orjson
import sqlite3
import logging
import sqlite3
import threading
from hashlib import sha256
from abc import ABC, abstractmethod
from hashlib import sha256

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import _StorageTools, cache

import orjson
from lxml import html
from tldextract import extract as tld

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import _StorageTools, cache


class StorageSystemMixin(ABC):
# If you want to make your own storage system, you have to inherit from this
Expand Down
11 changes: 5 additions & 6 deletions scrapling/core/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,14 @@

import re

from w3lib.html import HTML5_WHITESPACE
from scrapling.core.utils import cache
from scrapling.core._types import Any, Optional, Protocol, Self

from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from cssselect import HTMLTranslator as OriginalHTMLTranslator
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
from cssselect.xpath import ExpressionError
from cssselect.xpath import XPathExpr as OriginalXPathExpr
from w3lib.html import HTML5_WHITESPACE

from scrapling.core._types import Any, Optional, Protocol, Self
from scrapling.core.utils import cache

regex = f"[{HTML5_WHITESPACE}]+"
replace_html5_whitespaces = re.compile(regex).sub
Expand Down
13 changes: 8 additions & 5 deletions scrapling/core/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import re
import logging
import re
from itertools import chain
# Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache

from scrapling.core._types import Dict, Iterable, Any, Union

import orjson
from lxml import html

from scrapling.core._types import Any, Dict, Iterable, Union

# Using cache on top of a class is a brilliant way to achieve the Singleton design pattern without much code
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
from functools import lru_cache as cache # isort:skip


html_forbidden = {html.HtmlComment, }
logging.basicConfig(
level=logging.ERROR,
Expand Down
2 changes: 1 addition & 1 deletion scrapling/defaults.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher

# If you are going to use Fetchers with the default settings, import them from this file instead for cleaner-looking code
Fetcher = Fetcher()
Expand Down
4 changes: 2 additions & 2 deletions scrapling/engines/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from .camo import CamoufoxEngine
from .static import StaticEngine
from .pw import PlaywrightEngine
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
from .pw import PlaywrightEngine
from .static import StaticEngine
from .toolbelt import check_if_engine_usable

__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
20 changes: 8 additions & 12 deletions scrapling/engines/camo.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,16 @@
import logging
from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal

from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
get_os_name,
intercept_route,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)

from camoufox import DefaultAddons
from camoufox.sync_api import Camoufox

from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
Union)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity,
construct_proxy_dict, do_nothing,
generate_convincing_referer,
get_os_name, intercept_route)


class CamoufoxEngine:
def __init__(
Expand Down
23 changes: 9 additions & 14 deletions scrapling/engines/pw.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
import json
import logging
from scrapling.core._types import Union, Callable, Optional, List, Dict

from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
from scrapling.engines.toolbelt import (
Response,
do_nothing,
StatusText,
js_bypass_path,
intercept_route,
generate_headers,
construct_cdp_url,
check_type_validity,
construct_proxy_dict,
generate_convincing_referer,
)
from scrapling.core._types import Callable, Dict, List, Optional, Union
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
NSTBROWSER_DEFAULT_QUERY)
from scrapling.engines.toolbelt import (Response, StatusText,
check_type_validity, construct_cdp_url,
construct_proxy_dict, do_nothing,
generate_convincing_referer,
generate_headers, intercept_route,
js_bypass_path)


class PlaywrightEngine:
Expand Down
7 changes: 4 additions & 3 deletions scrapling/engines/static.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import logging

from scrapling.core._types import Union, Optional, Dict
from .toolbelt import Response, generate_convincing_referer, generate_headers

import httpx
from httpx._models import Response as httpxResponse

from scrapling.core._types import Dict, Optional, Union

from .toolbelt import Response, generate_convincing_referer, generate_headers


class StaticEngine:
def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
Expand Down
26 changes: 6 additions & 20 deletions scrapling/engines/toolbelt/__init__.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,6 @@
from .fingerprints import (
get_os_name,
generate_headers,
generate_convincing_referer,
)
from .custom import (
Response,
do_nothing,
StatusText,
BaseFetcher,
get_variable_name,
check_type_validity,
check_if_engine_usable,
)
from .navigation import (
js_bypass_path,
intercept_route,
construct_cdp_url,
construct_proxy_dict,
)
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
check_type_validity, do_nothing, get_variable_name)
from .fingerprints import (generate_convincing_referer, generate_headers,
get_os_name)
from .navigation import (construct_cdp_url, construct_proxy_dict,
intercept_route, js_bypass_path)
5 changes: 3 additions & 2 deletions scrapling/engines/toolbelt/custom.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,11 @@
import logging
from email.message import Message

from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
Type, Union)
from scrapling.core.custom_types import MappingProxyType
from scrapling.core.utils import cache, setup_basic_logging
from scrapling.parser import Adaptor, SQLiteStorageSystem
from scrapling.core.utils import setup_basic_logging, cache
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple


class ResponseEncoding:
Expand Down
10 changes: 5 additions & 5 deletions scrapling/engines/toolbelt/fingerprints.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

import platform

from scrapling.core.utils import cache
from scrapling.core._types import Union, Dict

from browserforge.fingerprints import Fingerprint, FingerprintGenerator
from browserforge.headers import Browser, HeaderGenerator
from tldextract import extract
from browserforge.headers import HeaderGenerator, Browser
from browserforge.fingerprints import FingerprintGenerator, Fingerprint

from scrapling.core._types import Dict, Union
from scrapling.core.utils import cache


@cache(None, typed=True)
Expand Down
10 changes: 5 additions & 5 deletions scrapling/engines/toolbelt/navigation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,16 @@
Functions related to files and URLs
"""

import os
import logging
from urllib.parse import urlparse, urlencode
import os
from urllib.parse import urlencode, urlparse

from playwright.sync_api import Route

from scrapling.core._types import Dict, Optional, Union
from scrapling.core.utils import cache
from scrapling.core._types import Union, Dict, Optional
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

from playwright.sync_api import Route


def intercept_route(route: Route) -> Union[Route, None]:
"""This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
Expand Down
9 changes: 5 additions & 4 deletions scrapling/fetchers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal

from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
Union)
from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
check_if_engine_usable)
from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing


class Fetcher(BaseFetcher):
Expand Down
23 changes: 15 additions & 8 deletions scrapling/parser.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,23 @@
import inspect
import os
import re
import inspect
from difflib import SequenceMatcher

from scrapling.core.translator import HTMLTranslator
from scrapling.core.mixins import SelectorsGeneration
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
from cssselect import SelectorError, SelectorSyntaxError
from cssselect import parse as split_selectors
from lxml import etree, html
from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors

from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
List, Optional, Pattern, SupportsIndex,
Tuple, Union)
from scrapling.core.custom_types import (AttributesHandler, TextHandler,
TextHandlers)
from scrapling.core.mixins import SelectorsGeneration
from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
StorageSystemMixin, _StorageTools)
from scrapling.core.translator import HTMLTranslator
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
is_jsonable, logging, setup_basic_logging)


class Adaptor(SelectorsGeneration):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from setuptools import setup, find_packages
from setuptools import find_packages, setup

with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
Expand Down
1 change: 1 addition & 0 deletions tests/fetchers/test_camoufox.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest

import pytest_httpbin

from scrapling import StealthyFetcher
Expand Down
1 change: 1 addition & 0 deletions tests/fetchers/test_httpx.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest

import pytest_httpbin

from scrapling import Fetcher
Expand Down
1 change: 1 addition & 0 deletions tests/fetchers/test_playwright.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import unittest

import pytest_httpbin

from scrapling import PlayWrightFetcher
Expand Down
Loading

0 comments on commit 1a17b2c

Please sign in to comment.