Skip to content

Commit

Permalink
Initial commit and release 0.0.1 to PyPi
Browse files Browse the repository at this point in the history
  • Loading branch information
Aditya Dedhia committed May 10, 2023
0 parents commit 4361221
Show file tree
Hide file tree
Showing 8 changed files with 899 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Virtual Environment
venv/

# OS Cache
.DS_Store

# Python Cache
__pycache__/

# Build Files
dist/
674 changes: 674 additions & 0 deletions LICENSE

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
# neutralise-link

## What are the objectives?

- Remove trackers
- Remove referrers
- Identify malicious intent
- Verify URL validity
- Improve URL load speeds

## Getting started

### Prerequisites

- Have python3 installed (e.g. using anaconda/homebrew)
- Have build installed - `python3 -m pip install --upgrade build`

### Building the package

Navigate to root directory of the project and run: `python3 -m build`

Install the package found in `neutralise-link/dist/`
in your repo using `pip3 install` followed by the relative path of the `.tar.gz` package file located in the project.

## How does it work?

After importing from `neutralise_link`, call the `neutralise` function, which takes a URL string as its argument.

By default, the function will return `None` in two cases:

1. The link is invalid
2. The link is deemed malicious

> You may override the 2nd case by calling the function with the optional parameter `safe=False`.
---

## Example Code

``` python
from neutralise_link import neutralise

def main(url: str) -> None:
    """Validate user URL input for storing and report the outcome."""

    # `neutralise` returns None when the URL is invalid or, in safe mode,
    # deemed malicious; otherwise it returns the cleaned URL.
    clean_url = neutralise(url=url, safe=True)
    if clean_url is None:
        print("URL is malformed or malicious.")
    else:
        print("URL is safe.")
```
29 changes: 29 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[build-system]
requires = [
"hatchling",
"certifi",
"charset-normalizer",
"idna",
"requests",
"urllib3",
]
build-backend = "hatchling.build"

[project]
name = "neutralise_link"
version = "0.0.1"
authors = [
  { name="Aditya Dedhia", email="[email protected]" }
]
description="Validating, cleaning, and compactifying URLs simplified."
readme = "README.md"
requires-python = ">=3.9"
# Runtime dependencies. The package imports `requests` when it is loaded, so it
# must be installed together with the package; `build-system.requires` only
# applies to the build environment and is not installed for end users.
dependencies = [
  "requests",
]
classifiers=[
  "Programming Language :: Python :: 3",
  "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
  "Operating System :: OS Independent",
]

[project.urls]
"Homepage" = "https://github.com/brainpolo/neutralise-link"
"Bug Tracker" = "https://github.com/brainpolo/neutralise-link/issues"
5 changes: 5 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
certifi==2023.5.7
charset-normalizer==3.1.0
idna==3.4
requests==2.30.0
urllib3==2.0.2
40 changes: 40 additions & 0 deletions src/neutralise_link/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Copyright (C) 2023 Aditya Dedhia
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.


from typing import Optional

from .main import is_mal, is_valid, rem_refs, rem_trackers, compactify

from .logger import logger


def neutralise(url: str, safe=True) -> Optional[str]:
    """
    Validate a URL and strip referrer and tracker parameters from it.

    The input is trimmed and prefixed with ``https://`` when it does not
    already start with an ``http://`` or ``https://`` scheme. It is then
    checked with ``is_mal`` (in safe mode) and ``is_valid``, passed through
    ``rem_refs`` and ``rem_trackers``, re-validated, and finally compactified.

    Args:
        url: The raw URL string to clean.
        safe: When true (the default), a URL flagged by ``is_mal`` is
            rejected. Pass ``False`` to skip that check.

    Returns:
        The compactified URL when it still validates, the filtered (but not
        compactified) URL when only the compact form fails validation, or
        ``None`` when the URL is empty, flagged as malicious, or invalid.
    """
    url = url.strip()
    if not url:
        # Nothing to validate; avoid issuing a request for a bare scheme.
        logger.info("Invalid URL detected: %s", url)
        return None

    # Test for the complete scheme prefix. A plain "http" prefix check would
    # wrongly skip hosts whose name merely begins with those letters
    # (e.g. "httpbin.org"), leaving them without any scheme.
    if not url.lower().startswith(("http://", "https://")):
        url = "https://" + url

    if safe and is_mal(url):  # * Default safe mode can be overridden
        logger.info("Malicious URL detected: %s", url)
        return None
    if not is_valid(url):
        logger.info("Invalid URL detected: %s", url)
        return None

    logger.info("URL is valid and will be processed.")

    url = rem_refs(url)
    url = rem_trackers(url)
    if not is_valid(url):
        logger.info("Invalid URL after filtering referrers and trackers: %s", url)
        return None

    minified_url = compactify(url)
    if not is_valid(minified_url):
        # Report the URL that actually failed, then fall back to the last
        # form that validated.
        logger.info("Invalid URL after minifying: %s", minified_url)
        return url
    return minified_url
16 changes: 16 additions & 0 deletions src/neutralise_link/logger.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Copyright (C) 2023 Aditya Dedhia
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

import logging

# Message layout shared by the handler below: level, timestamp, then text.
formatter = logging.Formatter("[%(levelname)s] %(asctime)s - %(message)s")

# Stream handler that lets every record from DEBUG upwards through.
ch = logging.StreamHandler()
ch.setFormatter(formatter)
ch.setLevel(logging.DEBUG)

# Package logger, registered under the name "app" and emitting from DEBUG up.
logger = logging.getLogger("app")
logger.addHandler(ch)
logger.setLevel(logging.DEBUG)
74 changes: 74 additions & 0 deletions src/neutralise_link/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright (C) 2023 Aditya Dedhia
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.


import re
import requests


def rem_refs(url: str) -> str:
    """
    Remove referrer parameters from the query string of a URL.

    The parameters ``sourceid``, ``sclient``, ``utm_source``, ``utm_medium``
    and ``utm_campaign`` are dropped wherever they appear in the query,
    including directly after the ``?``. The remaining parameters keep their
    order and exact text, and any ``#fragment`` is preserved.

    Args:
        url: The URL to filter.

    Returns:
        The URL without referrer parameters. The input is returned unchanged
        when it has no query string or none of the parameters are present.
    """
    ref_keys = frozenset(
        ("sourceid", "sclient", "utm_source", "utm_medium", "utm_campaign")
    )

    # Split off the fragment first so a "?" or "&" inside it is never
    # mistaken for part of the query string.
    head, hash_sep, fragment = url.partition("#")
    base, _, query = head.partition("?")
    if not query:
        return url

    params = query.split("&")
    kept = [p for p in params if p.partition("=")[0] not in ref_keys]
    if len(kept) == len(params):
        return url  # Nothing to remove: leave the URL byte-for-byte intact.

    # Drop empty segments so the result has no "&&" or dangling "&".
    kept = [p for p in kept if p]
    cleaned = f"{base}?{'&'.join(kept)}" if kept else base
    return cleaned + hash_sep + fragment


def rem_trackers(url: str) -> str:
    """
    Remove tracker parameters from the query string of a URL.

    The parameters ``ei``, ``aqs``, ``ved``, ``uact``, ``gs_lcp`` and
    ``mkt_tok`` are dropped wherever they appear in the query, including
    directly after the ``?``. The remaining parameters keep their order and
    exact text, and any ``#fragment`` is preserved.

    Args:
        url: The URL to filter.

    Returns:
        The URL without tracker parameters. The input is returned unchanged
        when it has no query string or none of the parameters are present.
    """
    tracker_keys = frozenset(("ei", "aqs", "ved", "uact", "gs_lcp", "mkt_tok"))

    # Split off the fragment first so a "?" or "&" inside it is never
    # mistaken for part of the query string.
    head, hash_sep, fragment = url.partition("#")
    base, _, query = head.partition("?")
    if not query:
        return url

    params = query.split("&")
    kept = [p for p in params if p.partition("=")[0] not in tracker_keys]
    if len(kept) == len(params):
        return url  # Nothing to remove: leave the URL byte-for-byte intact.

    # Drop empty segments so the result has no "&&" or dangling "&".
    kept = [p for p in kept if p]
    cleaned = f"{base}?{'&'.join(kept)}" if kept else base
    return cleaned + hash_sep + fragment


def compactify(url: str) -> str:
    """
    Shorten a URL for cosmetic purposes by dropping a leading ``www.``.

    Only a ``www.`` label at the very start of the host is removed (directly
    after the scheme, or at the start of a scheme-less URL). Occurrences of
    ``www.`` elsewhere in the host, path or query are left untouched.

    Args:
        url: The URL to shorten.

    Returns:
        The URL without its leading ``www.``, or the input unchanged when
        the host does not start with ``www.``.
    """
    # The optional group captures the scheme so it can be written back;
    # anchoring at the start keeps hosts such as "awww.example.com" and
    # paths such as "/www.backup.zip" intact.
    return re.sub(r"^([a-zA-Z][a-zA-Z0-9+.-]*://)?www\.", r"\1", url, count=1)


def is_mal(url: str) -> bool:
    """
    Report whether a URL is flagged as malicious.

    A URL is flagged when it contains the substring ``&backfill=``.
    """
    return "&backfill=" in url


def is_valid(url: str, timeout: float = 10.0) -> bool:
    """
    Check that a URL answers a HEAD request without an error status.

    Args:
        url: The URL to probe.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely on an unresponsive
            host.

    Returns:
        ``True`` when the request completes and the status is not 4xx/5xx;
        ``False`` when the request fails for any reason reported by
        ``requests`` (malformed URL, connection error, timeout, error status).
    """
    try:
        response = requests.head(url, timeout=timeout)
        response.raise_for_status()  # * Raised if status is 4xx, 5xx
    except requests.exceptions.RequestException:
        return False
    return True

0 comments on commit 4361221

Please sign in to comment.