From 9d3940520d82bf792d1848a5a0090aeb173ed49d Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 08:23:29 +0100 Subject: [PATCH 1/7] remove useless logging statements, use a clean entrypoint for subdomain tasks --- dashboard/internet_nl_dashboard/scanners/subdomains.py | 2 -- dashboard/internet_nl_dashboard/tasks.py | 4 ++-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/dashboard/internet_nl_dashboard/scanners/subdomains.py b/dashboard/internet_nl_dashboard/scanners/subdomains.py index 2e0ab492..bdfb3c55 100644 --- a/dashboard/internet_nl_dashboard/scanners/subdomains.py +++ b/dashboard/internet_nl_dashboard/scanners/subdomains.py @@ -57,11 +57,9 @@ def scan_status(account: Account, urllist_id: int): @app.task(queue="storage") def progress_subdomain_discovery_scans(): scans = SubdomainDiscoveryScan.objects.all().filter(state="requested") - log.error("yolo") tasks = [] for scan in scans: - log.error("swag") update_state(scan.id, "scanning") tasks.append(group(perform_subdomain_scan.si(scan.id) | update_state.si(scan.id, "finished"))) diff --git a/dashboard/internet_nl_dashboard/tasks.py b/dashboard/internet_nl_dashboard/tasks.py index 7f371536..7698eb60 100644 --- a/dashboard/internet_nl_dashboard/tasks.py +++ b/dashboard/internet_nl_dashboard/tasks.py @@ -9,7 +9,7 @@ from dashboard.celery import app from dashboard.internet_nl_dashboard.models import UrlList, UrlListReport -from dashboard.internet_nl_dashboard.scanners import scan_internet_nl_per_account +from dashboard.internet_nl_dashboard.scanners import scan_internet_nl_per_account, subdomains from dashboard.internet_nl_dashboard.scanners.scan_internet_nl_per_account import initialize_scan log = logging.getLogger(__name__) @@ -48,7 +48,7 @@ def start_scans_for_lists_who_are_up_for_scanning() -> Task: # explicitly declare the imported modules as this modules 'content', prevents pyflakes issues # Todo: List item 0 has incompatible type Module; expected Module -__all__: List[Module] = [scan_internet_nl_per_account] # type: ignore +__all__: List[Module] = [scan_internet_nl_per_account, subdomains] # type: ignore @app.task(queue='storage') From 6e0c7893840175ef2dfd0c517d6f4d3e6bcb6b63 Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 08:32:17 +0100 Subject: [PATCH 2/7] fix INTERNET-NL-DASHBOARD-6A --- dashboard/internet_nl_dashboard/logic/domains.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dashboard/internet_nl_dashboard/logic/domains.py b/dashboard/internet_nl_dashboard/logic/domains.py index 9205fa79..cd2d2151 100644 --- a/dashboard/internet_nl_dashboard/logic/domains.py +++ b/dashboard/internet_nl_dashboard/logic/domains.py @@ -331,7 +331,7 @@ def update_list_settings(account: Account, user_input: Dict) -> Dict[str, Any]: last_report_prefetch = Prefetch( 'urllistreport_set', # filter(pk=UrlListReport.objects.latest('id').pk). - queryset=UrlListReport.objects.order_by('-id').only('id', 'at_when'), + queryset=UrlListReport.objects.order_by('-id').only('id', 'at_when', 'urllist__id'), to_attr='last_report' ) From d5134d9269437df9099a569c7f67f6dbee1dcb7f Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 08:50:04 +0100 Subject: [PATCH 3/7] fix INTERNET-NL-DASHBOARD-6C handle spreadsheet uploads without the tags column --- dashboard/internet_nl_dashboard/logic/spreadsheet.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/dashboard/internet_nl_dashboard/logic/spreadsheet.py b/dashboard/internet_nl_dashboard/logic/spreadsheet.py index 4f522b00..7063d09c 100644 --- a/dashboard/internet_nl_dashboard/logic/spreadsheet.py +++ b/dashboard/internet_nl_dashboard/logic/spreadsheet.py @@ -154,7 +154,10 @@ def get_data(file: str) -> Dict[str, Dict[str, Dict[str, list]]]: # Data is parsed to python-like datatype. In this case we only expect strings and cast them as such. found_categories = str(row[0]).lower().strip().split(',') found_urls = str(row[1]).lower().strip().split(',') - found_tags = str(row[2]).lower().strip().split(',') + found_tags = [] + # if there is no tag column: + if len(row) > 2: + found_tags = str(row[2]).lower().strip().split(',') for found_category in found_categories: found_category = found_category.strip() From c7e4d2cca8b7cf1445eb4f6f4080f0f42d143b83 Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 09:33:13 +0100 Subject: [PATCH 4/7] fix #483 spreadsheet uploads overwrite list content --- dashboard/internet_nl_dashboard/logic/spreadsheet.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dashboard/internet_nl_dashboard/logic/spreadsheet.py b/dashboard/internet_nl_dashboard/logic/spreadsheet.py index 7063d09c..15291c8a 100644 --- a/dashboard/internet_nl_dashboard/logic/spreadsheet.py +++ b/dashboard/internet_nl_dashboard/logic/spreadsheet.py @@ -347,11 +347,21 @@ def upload_domain_spreadsheet_to_list(account: Account, user: DashboardUser, url if not urllist: return {'error': True, 'success': False, 'message': 'list_does_not_exist', 'details': '', 'status': 'error'} + # the spreadsheet content is leading, this means that anything in the current list, including tags, will + # be removed. There is no smart merging strategy here. This might be added in the future: where we look + # at what is already in the list and only add changes. + # this will also remove the tags on this list automatically, without touching other lists that have the same + # url and different tags. + urllist.urls.clear() + # we don't care about the list name, we'll just add anything that is given as input... result = {'incorrect_urls': [], 'added_to_list': 0, 'already_in_list': 0} for _, domain_data in domain_lists.items(): + log.debug(domain_data) + # todo: when a tag has a domain, it might be added as a domain, which is wrong. Only use the first + # column of uploaded data. extracted_urls, _ = retrieve_possible_urls_from_unfiltered_input(", ".join(domain_data)) cleaned_urls = clean_urls(extracted_urls) From c6b4c7382d65bd21f2ec32b755da096ec81b537d Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 09:57:20 +0100 Subject: [PATCH 5/7] fix INTERNET-NL-DASHBOARD-69 fix loading of templates of unsupported locales --- dashboard/internet_nl_dashboard/logic/mail.py | 14 +++++++++++++- dashboard/internet_nl_dashboard/tests/test_mail.py | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/dashboard/internet_nl_dashboard/logic/mail.py b/dashboard/internet_nl_dashboard/logic/mail.py index a622e7ac..795cefb6 100644 --- a/dashboard/internet_nl_dashboard/logic/mail.py +++ b/dashboard/internet_nl_dashboard/logic/mail.py @@ -18,6 +18,7 @@ from dashboard.internet_nl_dashboard.logic.report_comparison import (compare_report_in_detail, filter_comparison_report, key_calculation, render_comparison_view) from dashboard.internet_nl_dashboard.models import AccountInternetNLScan, DashboardUser, UrlListReport +from dashboard.settings import LANGUAGES log = logging.getLogger(__package__) @@ -92,9 +93,11 @@ def send_scan_finished_mails(scan: AccountInternetNLScan) -> int: report = UrlListReport.objects.all().filter(id=scan.report.id).order_by("-id").defer('calculation').first() for user in users: + log.debug("Sending finished mail to user %s", user.id) # set unsubscribe code if it's not set yet. This allows the user to instantly unsubscribe from this feed. if user.dashboarduser.mail_after_mail_unsubscribe_code == "": + log.debug("For some reason user %s has no unsubscribe code, generating one now", user.id) user.dashboarduser.mail_after_mail_unsubscribe_code = generate_unsubscribe_code() user.dashboarduser.save() @@ -132,7 +135,7 @@ def send_scan_finished_mails(scan: AccountInternetNLScan) -> int: previous = convert_to_email_safe_values(previous, user.dashboarduser.mail_preferred_language.code.lower()) placeholders = {**placeholders, **previous} - + log.debug("Sending actual finished mail to user %s", user.id) mail.send( sender=config.EMAIL_NOTIFICATION_SENDER, recipients=user.dashboarduser.mail_preferred_mail_address, # List of email addresses also accepted @@ -213,6 +216,15 @@ def values_from_previous_report(report_id: int, previous_report: UrlListReport) def convert_to_email_safe_values(values: dict, mail_language: str = "en") -> dict: + + # in some cases this is not set or defaults to 'af' / afghanistan, which is the first ISO code in the list + # list here: https://en.wikipedia.org/wiki/List_of_ISO_3166_country_codes. Even though we do error handling. + # so we're nice here and try to use a code that we know in case this happens. + # see issue INTERNET-NL-DASHBOARD-68 + if mail_language not in [language_code for language_code, name in LANGUAGES]: + mail_language = 'en' + log.debug("Mail language: %s", mail_language) + return { "previous_report_available": str(values["previous_report_available"]), "previous_report_average_internet_nl_score": values["previous_report_average_internet_nl_score"], diff --git a/dashboard/internet_nl_dashboard/tests/test_mail.py b/dashboard/internet_nl_dashboard/tests/test_mail.py index fc7f9b11..f56b226f 100644 --- a/dashboard/internet_nl_dashboard/tests/test_mail.py +++ b/dashboard/internet_nl_dashboard/tests/test_mail.py @@ -18,7 +18,8 @@ def setup_test(): account = Account(**{'name': 'test'}) account.save() - dashboarduser = DashboardUser(**{'mail_preferred_mail_address': 'info@example.com', 'mail_preferred_language': 'nl', + # use a language that is not supported, so the system will fall back to english... + dashboarduser = DashboardUser(**{'mail_preferred_mail_address': 'info@example.com', 'mail_preferred_language': 'af', 'mail_send_mail_after_scan_finished': True, 'account': account, 'user': user}) dashboarduser.save() From d1c5a63321a43fe5e1d2d05f3fd3c5bd0748c5c5 Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 10:44:56 +0100 Subject: [PATCH 6/7] update websecmap, fix medium vulnerabile dependencies --- requirements-deploy.in | 2 +- requirements-deploy.txt | 14 +++++++------- requirements-dev.in | 2 +- requirements-dev.txt | 27 +++++++++------------------ requirements.in | 4 ++-- requirements.txt | 17 ++++++++--------- 6 files changed, 28 insertions(+), 38 deletions(-) diff --git a/requirements-deploy.in b/requirements-deploy.in index 0d865e56..26fb1268 100644 --- a/requirements-deploy.in +++ b/requirements-deploy.in @@ -3,7 +3,7 @@ # make sure there are no version conflicts with requirements.txt -c requirements.txt -websecmap[deploy] @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@6fcf0110e5809fceb0ddf93b8553804a7cc52151 +websecmap[deploy] @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@d60d90d787d94de122dfe139df4790b82eee7554 # include (security) version constraints for non primary dependencies -c security-constraints.in diff --git a/requirements-deploy.txt b/requirements-deploy.txt index 2859d42d..02b33b38 100644 --- a/requirements-deploy.txt +++ b/requirements-deploy.txt @@ -4,7 +4,7 @@ # # pip-compile --output-file=requirements-deploy.txt requirements-deploy.in # -aiohttp==3.8.4 +aiohttp==3.8.6 # via # -c requirements.txt # geoip2 @@ -97,7 +97,7 @@ colorlog==6.7.0 # via # -c requirements.txt # websecmap -cryptography==41.0.3 +cryptography==41.0.5 # via # -c requirements.txt # -c security-constraints.in @@ -123,7 +123,7 @@ diff-match-patch==20230430 # -c requirements.txt # django-import-export # websecmap -django==4.2.3 +django==4.2.7 # via # -c requirements.txt # -c security-constraints.in @@ -472,7 +472,7 @@ rdp==0.8 # via # -c requirements.txt # websecmap -redis==4.6.0 +redis==5.0.0 # via # -c requirements.txt # celery @@ -569,7 +569,7 @@ tldextract==3.4.4 # via # -c requirements.txt # websecmap -tornado==6.3.2 +tornado==6.3.3 # via # -c requirements.txt # flower @@ -585,7 +585,7 @@ typing-extensions==4.6.3 # asgiref # django-countries # websecmap -urllib3==1.26.6 +urllib3==1.26.18 # via # -c requirements.txt # requests @@ -607,7 +607,7 @@ vine==1.3.0 # celery # flower # websecmap -websecmap[deploy] @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@6fcf0110e5809fceb0ddf93b8553804a7cc52151 +websecmap[deploy] @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@d60d90d787d94de122dfe139df4790b82eee7554 # via # -c requirements.txt # -r requirements-deploy.in diff --git a/requirements-dev.in b/requirements-dev.in index 29209b75..621d4e1b 100644 --- a/requirements-dev.in +++ b/requirements-dev.in @@ -18,7 +18,7 @@ django-extensions django-debug-toolbar django-debug-toolbar-request-history pytest-mock -websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@6fcf0110e5809fceb0ddf93b8553804a7cc52151 +websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@d60d90d787d94de122dfe139df4790b82eee7554 # use the version with relaxed attr requirements pytest-docker @ git+https://github.com/avast/pytest-docker@519b155009b6f3570c01f1f56e7c4e6ce3c5c760 diff --git a/requirements-dev.txt b/requirements-dev.txt index f90c5b56..f80eb2ea 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,9 +2,9 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=requirements-dev.txt requirements-dev.in +# pip-compile --output-file=requirements-dev.txt --resolver=backtracking requirements-dev.in # -aiohttp==3.8.4 +aiohttp==3.8.6 # via # -c requirements.txt # geoip2 @@ -87,7 +87,6 @@ celery-statsd==1.0.0 # websecmap certifi==2023.7.22 # via - # -c requirements.txt # -c security-constraints.in # requests # sentry-sdk @@ -122,9 +121,8 @@ coverage[toml]==7.2.7 # via # -r requirements-dev.in # pytest-cov -cryptography==41.0.3 +cryptography==41.0.5 # via - # -c requirements.txt # -c security-constraints.in # pyopenssl # types-pyopenssl @@ -152,9 +150,8 @@ diff-match-patch==20230430 # websecmap dill==0.3.6 # via pylint -django==4.2.3 +django==4.2.7 # via - # -c requirements.txt # -c security-constraints.in # django-celery-beat # django-colorful @@ -376,7 +373,6 @@ lml==0.1.0 # websecmap lxml==4.9.2 # via - # -c requirements.txt # -c security-constraints.in # dnsrecon # websecmap @@ -472,7 +468,6 @@ phonenumberslite==8.13.15 # via -r requirements-dev.in pillow==9.5.0 # via - # -c requirements.txt # -c security-constraints.in # python-resize-image # reportlab @@ -637,7 +632,7 @@ rdp==0.8 # websecmap recommonmark==0.7.1 # via -r requirements-dev.in -redis==4.6.0 +redis==5.0.0 # via # -c requirements.txt # celery @@ -698,7 +693,6 @@ six==1.16.0 # via # -c requirements.txt # celery-statsd - # django-jet-reboot # livereload # python-dateutil # python-monkey-business @@ -741,7 +735,6 @@ sphinxcontrib-serializinghtml==1.1.5 # via sphinx sqlparse==0.4.4 # via - # -c requirements.txt # -c security-constraints.in # django # django-debug-toolbar @@ -795,7 +788,7 @@ tomli==2.0.1 # pytest tomlkit==0.11.8 # via pylint -tornado==6.3.2 +tornado==6.3.3 # via # -c requirements.txt # flower @@ -838,7 +831,7 @@ typing-extensions==4.6.3 # django-stubs-ext # mypy # websecmap -urllib3==1.26.6 +urllib3==1.26.18 # via # -c requirements.txt # requests @@ -858,10 +851,8 @@ vine==1.3.0 # websecmap vulture==2.7 # via -r requirements-dev.in -websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@6fcf0110e5809fceb0ddf93b8553804a7cc52151 - # via - # -c requirements.txt - # -r requirements-dev.in +websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@d60d90d787d94de122dfe139df4790b82eee7554 + # via -r requirements-dev.in wikidata==0.7.0 # via # -c requirements.txt diff --git a/requirements.in b/requirements.in index c4aefd72..36839d25 100644 --- a/requirements.in +++ b/requirements.in @@ -9,7 +9,7 @@ # - no version pinning, unless it is required and explained # to update websecmap, update the SHA hash and run: make update_requirement_websecmap -websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@6fcf0110e5809fceb0ddf93b8553804a7cc52151 +websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@d60d90d787d94de122dfe139df4790b82eee7554 django-otp django-two-factor-auth>1.15 @@ -49,4 +49,4 @@ phonenumbers # https://github.com/pyexcel/pyexcel-xlsx/issues/52 pyexcel==0.6.7 pyexcel-xlsx==0.6.0 -openpyxl==3.0.10 \ No newline at end of file +openpyxl==3.0.10 diff --git a/requirements.txt b/requirements.txt index dcf9a808..dcd377be 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,9 +2,9 @@ # This file is autogenerated by pip-compile with Python 3.10 # by the following command: # -# pip-compile --output-file=requirements.txt requirements.in +# pip-compile --output-file=requirements.txt --resolver=backtracking requirements.in # -aiohttp==3.8.4 +aiohttp==3.8.6 # via # geoip2 # openai @@ -74,7 +74,7 @@ charset-normalizer==3.1.0 # websecmap colorlog==6.7.0 # via websecmap -cryptography==41.0.3 +cryptography==41.0.5 # via # -c security-constraints.in # pyopenssl @@ -94,7 +94,7 @@ diff-match-patch==20230430 # via # django-import-export # websecmap -django==4.2.3 +django==4.2.7 # via # -c security-constraints.in # django-activity-stream @@ -374,7 +374,7 @@ qrcode==7.4.2 # via django-two-factor-auth rdp==0.8 # via websecmap -redis==4.6.0 +redis==5.0.0 # via # celery # websecmap @@ -409,7 +409,6 @@ simplejson==3.19.1 six==1.16.0 # via # celery-statsd - # django-jet-reboot # python-dateutil # python-monkey-business # requests-file @@ -446,7 +445,7 @@ texttable==1.6.7 # websecmap tldextract==3.4.4 # via websecmap -tornado==6.3.2 +tornado==6.3.3 # via # flower # websecmap @@ -460,7 +459,7 @@ typing-extensions==4.6.3 # django-countries # qrcode # websecmap -urllib3==1.26.6 +urllib3==1.26.18 # via # requests # sentry-sdk @@ -475,7 +474,7 @@ vine==1.3.0 # celery # flower # websecmap -websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@6fcf0110e5809fceb0ddf93b8553804a7cc52151 +websecmap @ git+https://gitlab.com/internet-cleanup-foundation/web-security-map@d60d90d787d94de122dfe139df4790b82eee7554 # via -r requirements.in wikidata==0.7.0 # via websecmap From 09719be5b68fc521630b0c44de353ff70590d264 Mon Sep 17 00:00:00 2001 From: stitch1 Date: Mon, 6 Nov 2023 10:50:15 +0100 Subject: [PATCH 7/7] fix lint --- .../logic/spreadsheet.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/dashboard/internet_nl_dashboard/logic/spreadsheet.py b/dashboard/internet_nl_dashboard/logic/spreadsheet.py index 15291c8a..119fef56 100644 --- a/dashboard/internet_nl_dashboard/logic/spreadsheet.py +++ b/dashboard/internet_nl_dashboard/logic/spreadsheet.py @@ -115,6 +115,22 @@ def is_valid_extension(file: str) -> bool: return False +def get_sheet(file: str) -> List: + try: + sheet = p.get_sheet(file_name=file, name_columns_by_row=0) + except XLRDError: + # xlrd.biffh.XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'thisfile' + return [] + except zipfile.BadZipFile: + # the corrupted file in the unit tests + return [] + except Exception as exc: # pylint: disable=broad-except + log.exception(exc) + return [] + + return sheet + + def get_data(file: str) -> Dict[str, Dict[str, Dict[str, list]]]: """ Will return a simple set of data, without too much validation. Deduplicates data per unique category. @@ -133,16 +149,8 @@ def get_data(file: str) -> Dict[str, Dict[str, Dict[str, list]]]: data: Dict[str, Any] = {} - try: - sheet = p.get_sheet(file_name=file, name_columns_by_row=0) - except XLRDError: - # xlrd.biffh.XLRDError: Unsupported format, or corrupt file: Expected BOF record; found b'thisfile' - return data - except zipfile.BadZipFile: - # the corrupted file in the unit tests - return data - except Exception as exc: # pylint: disable=broad-except - log.exception(exc) + sheet = get_sheet(file=file) + if not sheet: return data # Skips the first entry