-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit and release 0.0.1 to PyPi
- Loading branch information
Aditya Dedhia
committed
May 10, 2023
0 parents
commit 4361221
Showing
8 changed files
with
899 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
# Virtual Environment | ||
venv/ | ||
|
||
# OS Cache | ||
.DS_Store | ||
|
||
# Python Cache | ||
__pycache__/ | ||
|
||
# Build Files | ||
dist/ |
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,50 @@ | ||
# neutralise-link | ||
|
||
## What are the objectives? | ||
|
||
- Remove trackers | ||
- Remove referrers | ||
- Identify malicious intent | ||
- Verify URL validity | ||
- Improve URL load speeds | ||
|
||
## Getting started | ||
|
||
### Prerequisites | ||
|
||
- Have python3 installed (e.g. using anaconda/homebrew) | ||
- Have build installed - `python3 -m pip install --upgrade build` | ||
|
||
### Building the package | ||
|
||
Navigate to root directory of the project and run: `python3 -m build` | ||
|
||
Install the package found in `neutralise-link/dist/` | ||
in your repo using `pip3 install` followed by the relative path of the `.tar.gz` package file located in the project. | ||
|
||
## How does it work? | ||
|
||
Having imported `neutralise_link`, you may use the `neutralise` function, which takes a URL string as its argument. | ||
|
||
By default, the function will return `None` in two cases: | ||
|
||
1. The link is invalid | ||
2. The link is deemed malicious | ||
|
||
> You may override the 2nd case by calling the function with the optional parameter `safe=False`. | ||
--- | ||
|
||
## Example Code | ||
|
||
``` python | ||
from neutralise_link import neutralise | ||
|
||
def main(url: str) -> str: | ||
"""Validate user URL input for storing.""" | ||
|
||
url = neutralise(url=url, safe=True) | ||
if not url: | ||
print("URL is malformed or malicious.") | ||
else: | ||
print("URL is safe.") | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
[build-system] | ||
requires = [ | ||
"hatchling", | ||
"certifi", | ||
"charset-normalizer", | ||
"idna", | ||
"requests", | ||
"urllib3", | ||
] | ||
build-backend = "hatchling.build" | ||
|
||
[project] | ||
name = "neutralise_link" | ||
version = "0.0.1" | ||
authors = [ | ||
{ name="Aditya Dedhia", email="[email protected]" } | ||
] | ||
description="Validating, cleaning, and compactifying URLs simplified." | ||
readme = "README.md" | ||
requires-python = ">=3.9" | ||
classifiers=[ | ||
"Programming Language :: Python :: 3", | ||
"License :: OSI Approved :: GNU General Public License v3 (GPLv3)", | ||
"Operating System :: OS Independent", | ||
] | ||
|
||
[project.urls] | ||
"Homepage" = "https://github.com/brainpolo/neutralise-link" | ||
"Bug Tracker" = "https://github.com/brainpolo/neutralise-link/issues" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
certifi==2023.5.7 | ||
charset-normalizer==3.1.0 | ||
idna==3.4 | ||
requests==2.30.0 | ||
urllib3==2.0.2 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
# Copyright (C) 2023 Aditya Dedhia | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
|
||
|
||
from .main import is_mal, is_valid, rem_refs, rem_trackers, compactify | ||
|
||
from .logger import logger | ||
|
||
|
||
def neutralise(url: str, safe=True) -> str:
    """
    Validate a URL and strip referrer/tracker parameters from it.

    The input is trimmed and given an ``https://`` scheme when it has none,
    optionally screened with ``is_mal``, checked with ``is_valid``, cleaned
    with ``rem_refs`` and ``rem_trackers``, and finally shortened with
    ``compactify``.

    Args:
        url: The raw URL string to process.
        safe: When true (the default), URLs flagged by ``is_mal`` are
            rejected. Pass ``safe=False`` to skip that check.

    Returns:
        The cleaned URL, preferring the compactified form when it still
        validates. ``None`` (despite the ``str`` annotation) when the input
        is empty, is flagged as malicious in safe mode, or fails validation
        before or after cleaning.
    """
    url = url.strip() if url else ""
    if not url:
        # Nothing to validate; skip the network round-trip entirely.
        logger.info("Invalid URL detected: %s", url)
        return None

    # Compare against the full scheme prefix: a bare "http" check would treat
    # a scheme-less host such as "httpbin.org" as already having a scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    if safe and is_mal(url):  # * Default safe mode can be overridden
        logger.info("Malicious URL detected: %s", url)
        return None
    if not is_valid(url):
        logger.info("Invalid URL detected: %s", url)
        return None

    logger.info("URL is valid and will be processed.")

    url = rem_refs(url)
    url = rem_trackers(url)
    if not is_valid(url):
        logger.info("Invalid URL after filtering referrers and trackers: %s", url)
        return None

    minified_url = compactify(url)
    if not is_valid(minified_url):
        # Compactifying is cosmetic only, so fall back to the cleaned URL
        # that has already been validated above.
        logger.info("Invalid URL after minifying: %s", minified_url)
        return url
    return minified_url
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Copyright (C) 2023 Aditya Dedhia | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
|
||
import logging | ||
|
||
# Package logger setup: every record from DEBUG upwards is written through a
# single stream handler using the "[LEVEL] timestamp - message" layout.
formatter = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")

# StreamHandler() with no argument writes to sys.stderr.
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

logger = logging.getLogger("app")
logger.setLevel(logging.DEBUG)
logger.addHandler(ch)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# Copyright (C) 2023 Aditya Dedhia | ||
# | ||
# This program is free software: you can redistribute it and/or modify | ||
# it under the terms of the GNU General Public License as published by | ||
# the Free Software Foundation, either version 3 of the License, or | ||
# (at your option) any later version. | ||
|
||
|
||
import re | ||
import requests | ||
|
||
|
||
# Matches one referrer parameter ("name=value" plus the "&" that follows it,
# if any) when it sits directly after "?" or "&". The value stops at "&" or
# "#" so a trailing fragment is never swallowed.
_REFERRER_PARAM_RE = re.compile(
    r"(?<=[?&])(?:sourceid|sclient|utm_source|utm_medium|utm_campaign)=[^&#]*&?"
)


def rem_refs(url: str) -> str:
    """
    Remove referrer parameters from a URL.

    The parameters removed are ``sourceid``, ``sclient``, ``utm_source``,
    ``utm_medium`` and ``utm_campaign``, whether they appear first in the
    query (after ``?``) or later (after ``&``). All other parameters and any
    ``#fragment`` are left exactly as written.

    Args:
        url: The URL to clean.

    Returns:
        The URL without the referrer parameters and without the stray
        ``?``/``&`` separators their removal would otherwise leave behind.
        The input is returned unchanged when it has none of them.
    """
    cleaned = _REFERRER_PARAM_RE.sub("", url)
    if cleaned == url:
        return url
    # Removing the last parameter leaves a dangling "?" or "&" right before
    # the fragment or the end of the string; drop it.
    return re.sub(r"[?&](?=#|$)", "", cleaned)
|
||
|
||
# Matches one tracker parameter ("name=value" plus the "&" that follows it,
# if any) when it sits directly after "?" or "&". The value stops at "&" or
# "#" so a trailing fragment is never swallowed.
_TRACKER_PARAM_RE = re.compile(
    r"(?<=[?&])(?:ei|aqs|ved|uact|gs_lcp|mkt_tok)=[^&#]*&?"
)


def rem_trackers(url: str) -> str:
    """
    Remove tracker parameters from a URL.

    The parameters removed are ``ei``, ``aqs``, ``ved``, ``uact``,
    ``gs_lcp`` and ``mkt_tok``, whether they appear first in the query
    (after ``?``) or later (after ``&``). All other parameters and any
    ``#fragment`` are left exactly as written.

    Args:
        url: The URL to clean.

    Returns:
        The URL without the tracker parameters and without the stray
        ``?``/``&`` separators their removal would otherwise leave behind.
        The input is returned unchanged when it has none of them.
    """
    cleaned = _TRACKER_PARAM_RE.sub("", url)
    if cleaned == url:
        return url
    # Removing the last parameter leaves a dangling "?" or "&" right before
    # the fragment or the end of the string; drop it.
    return re.sub(r"[?&](?=#|$)", "", cleaned)
|
||
|
||
# A leading "www." label, optionally preceded by a scheme. The lookahead
# requires another dot later in the host so "www.com" is not reduced to "com".
_LEADING_WWW_RE = re.compile(
    r"^((?:[a-z][a-z0-9+.-]*://)?)www\.(?=[^/?#]*\.)", re.IGNORECASE
)


def compactify(url: str) -> str:
    """
    Shorten a URL for cosmetic purposes by dropping a leading ``www.``.

    Only a ``www.`` label at the very start of the host is removed. The same
    text elsewhere in the host, path, query or fragment is left untouched.

    Args:
        url: The URL to shorten, with or without a scheme.

    Returns:
        The URL without its leading ``www.``, or the input unchanged when
        there is nothing to remove.
    """
    return _LEADING_WWW_RE.sub(r"\1", url, count=1)
|
||
|
||
def is_mal(url: str) -> bool:
    """
    Report whether a URL carries the ``backfill`` query parameter.

    The parameter is detected both as the first query parameter
    (``?backfill=``) and as a later one (``&backfill=``).

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` when the parameter is present, ``False`` otherwise.
    """
    return "?backfill=" in url or "&backfill=" in url
|
||
|
||
def is_valid(url: str, timeout: float = 10.0) -> bool:
    """
    Check that a URL answers an HTTP ``HEAD`` request without an error status.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            host.

    Returns:
        ``True`` when the request completes and the status is not 4xx/5xx.
        ``False`` when the request fails (malformed URL, connection error,
        timeout) or the server answers with a 4xx/5xx status.
    """
    try:
        # The context manager releases the connection once the status is read.
        with requests.head(url, timeout=timeout) as response:
            response.raise_for_status()  # * Raised if status is 4xx, 5xx
    except requests.exceptions.RequestException:
        return False
    return True