Skip to content

Commit bcf7252

Browse files
authored
Merge pull request #124 from openzim/blacklist
Add capability to blacklist some websites and redirect them to library / Github issue
2 parents ed34bd3 + f914d46 commit bcf7252

File tree

12 files changed

+463
-7
lines changed

12 files changed

+463
-7
lines changed

api/src/zimitfrontend/constants.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import json
12
import os
23
import pathlib
34
import random
@@ -16,6 +17,11 @@
1617
),
1718
)
1819

20+
blacklist = json.loads(
21+
(pathlib.Path(__file__).parent / "res/blacklist.json").read_bytes()
22+
)["blacklist"]
23+
logger.info(f"{len(blacklist)} websites are blacklisted")
24+
1925

2026
def _get_int_setting(environment_variable_name: str, default_value: int) -> int:
2127
"""Get environment variable as integer or fallback to default value"""
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
{
2+
"blacklist": [
3+
{
4+
"host": "devdocs.io",
5+
"reason": "already_zimed",
6+
"libraryUrl": "https://library.kiwix.org/#tag=devdocs"
7+
},
8+
{
9+
"host": "gutenberg.org",
10+
"reason": "already_zimed",
11+
"libraryUrl": "https://library.kiwix.org/#tag=gutenberg"
12+
},
13+
{
14+
"host": "freecodecamp.org",
15+
"reason": "already_zimed",
16+
"libraryUrl": "https://library.kiwix.org/#tag=freecodecamp"
17+
},
18+
{
19+
"host": "ifixit.com",
20+
"reason": "already_zimed",
21+
"libraryUrl": "https://library.kiwix.org/#tag=ifixit"
22+
},
23+
{
24+
"host": "khanacademy.org",
25+
"reason": "already_zimed",
26+
"libraryUrl": "https://library.kiwix.org/#tag=khan-academy"
27+
},
28+
{
29+
"host": "africanstorybook.org",
30+
"reason": "already_zimed",
31+
"libraryUrl": "https://library.kiwix.org/#tag=&q=african+storybooks"
32+
},
33+
{
34+
"host": "libretexts.org",
35+
"reason": "already_zimed",
36+
"libraryUrl": "https://library.kiwix.org/#tag=&q=libretexts"
37+
},
38+
{
39+
"host": "phet.colorado.edu",
40+
"reason": "already_zimed",
41+
"libraryUrl": "https://library.kiwix.org/#tag=&q=phet"
42+
},
43+
{
44+
"host": "ted.com",
45+
"reason": "already_zimed",
46+
"libraryUrl": "https://library.kiwix.org/#tag=ted"
47+
},
48+
{
49+
"host": "youtube.com",
50+
"reason": "too_big_partially_already_zimed",
51+
"libraryUrl": "https://library.kiwix.org/#tag=youtube",
52+
"scraperUrl": "https://github.com/openzim/youtube"
53+
},
54+
{
55+
"host": "youtu.be",
56+
"reason": "too_big_partially_already_zimed",
57+
"libraryUrl": "https://library.kiwix.org/#tag=youtube",
58+
"scraperUrl": "https://github.com/openzim/youtube"
59+
},
60+
{
61+
"host": "stackexchange.com",
62+
"reason": "already_zimed",
63+
"libraryUrl": "https://library.kiwix.org/#tag=stack_exchange"
64+
},
65+
{
66+
"host": "stackoverflow.com",
67+
"reason": "already_zimed",
68+
"libraryUrl": "https://library.kiwix.org/#tag=stack_exchange"
69+
},
70+
{
71+
"host": "wikibooks.org",
72+
"reason": "already_zimed",
73+
"libraryUrl": "https://library.kiwix.org/#tag=wikibooks",
74+
"wp1Hint": true
75+
},
76+
{
77+
"host": "wikinews.org",
78+
"reason": "already_zimed",
79+
"libraryUrl": "https://library.kiwix.org/#tag=wikinews",
80+
"wp1Hint": true
81+
},
82+
{
83+
"host": "wikipedia.org",
84+
"reason": "already_zimed",
85+
"libraryUrl": "https://library.kiwix.org/#tag=wikipedia",
86+
"wp1Hint": true
87+
},
88+
{
89+
"host": "wikiquote.org",
90+
"reason": "already_zimed",
91+
"libraryUrl": "https://library.kiwix.org/#tag=wikiquote",
92+
"wp1Hint": true
93+
},
94+
{
95+
"host": "vikidia.org",
96+
"reason": "already_zimed",
97+
"libraryUrl": "https://library.kiwix.org/#tag=vikidia"
98+
},
99+
{
100+
"host": "wikisource.org",
101+
"reason": "already_zimed",
102+
"libraryUrl": "https://library.kiwix.org/#tag=wikisource",
103+
"wp1Hint": true
104+
},
105+
{
106+
"host": "wikiversity.org",
107+
"reason": "already_zimed",
108+
"libraryUrl": "https://library.kiwix.org/#tag=wikiversity",
109+
"wp1Hint": true
110+
},
111+
{
112+
"host": "wikivoyage.org",
113+
"reason": "already_zimed",
114+
"libraryUrl": "https://library.kiwix.org/#tag=wikivoyage",
115+
"wp1Hint": true
116+
},
117+
{
118+
"host": "wiktionary.org",
119+
"reason": "already_zimed",
120+
"libraryUrl": "https://library.kiwix.org/#tag=wiktionary",
121+
"wp1Hint": true
122+
},
123+
{
124+
"host": "reddit.com",
125+
"reason": "scraper_needed",
126+
"githubIssue": "https://github.com/openzim/zim-requests/issues/242"
127+
},
128+
{
129+
"host": "archive.org",
130+
"reason": "scraper_needed",
131+
"githubIssue": "https://github.com/openzim/zim-requests/issues/360"
132+
},
133+
{ "host": "quora.com", "reason": "not_possible_with_zimit" },
134+
{ "host": "download.kiwix.org", "reason": "not_possible_with_zimit" },
135+
{ "host": "google.com", "reason": "not_possible_with_zimit" },
136+
{ "host": "library.kiwix.org", "reason": "not_possible_with_zimit" },
137+
{ "host": "wikileaks.org", "reason": "not_possible_with_zimit" },
138+
{ "host": "minecraft.net", "reason": "not_possible_with_zimit" },
139+
{ "host": "github.com", "reason": "not_possible_with_zimit" },
140+
{ "host": "zimit.kiwix.org", "reason": "not_possible_with_zimit" },
141+
{ "host": "coursera.org", "reason": "not_possible_with_zimit" },
142+
{ "host": "facebook.com", "reason": "not_possible_with_zimit" }
143+
]
144+
}

api/src/zimitfrontend/routes/requests.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from fastapi import APIRouter, HTTPException, Path, Request
77

8-
from zimitfrontend.constants import ApiConfiguration, logger
8+
from zimitfrontend.constants import ApiConfiguration, blacklist, logger
99
from zimitfrontend.routes.schemas import (
1010
TaskCancelRequest,
1111
TaskCreateRequest,
@@ -86,6 +86,20 @@ def create_task(
8686

8787
url = urllib.parse.urlparse(request.url)
8888

89+
matching_blacklist_entries = [
90+
blacklist_entry
91+
for blacklist_entry in blacklist
92+
if blacklist_entry["host"].lower() in url.geturl().lower()
93+
]
94+
matching_blacklist_entry = (
95+
matching_blacklist_entries[0] if matching_blacklist_entries else None
96+
)
97+
if matching_blacklist_entry:
98+
raise HTTPException(
99+
HTTPStatus.BAD_REQUEST,
100+
detail={"error": "blacklisted", "blacklist": matching_blacklist_entry},
101+
)
102+
89103
# generate schedule name
90104
ident = str(uuid.uuid4())[:8]
91105
schedule_name = f"{url.hostname}_{ident}"

dev/zimit_ui_dev/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
"kiwix_download_page": "https://kiwix.org/en/applications/",
88
"kiwix_contact_us": "https://kiwix.org/en/contact-us/",
99
"report_issues_page": "https://github.com/openzim/zimit/issues/",
10-
"home_page": "https://zimit.kiwix.org/",
10+
"home_page": "http://localhost:8001/",
1111
"zim_download_url": "https://s3.us-west-1.wasabisys.com/org-kiwix-zimit/zim",
1212
"new_request_advanced_flags": [
1313
"lang",

locales/en.json

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,5 +113,31 @@
113113
"goToTask": "Go to ongoing task {taskLink}",
114114
"excessiveUsage": "We have detect excessive usage from your environment, please come back in few hours.",
115115
"abnormalUsage": "Your address has been temporarily blocked from posting additional requests because of what our servers perceived as abnormal usage: {status}"
116+
},
117+
"blacklist": {
118+
"missingReason": "Blacklist reason must be set",
119+
"missingLibraryUrl": "Library URL must be set in blacklist reason",
120+
"missingGithubIssueUrl": "Github Issue URL must be set in blacklist reason",
121+
"goBack": "Go back",
122+
"contactUs": "Should you need a special ZIM, please contact us.",
123+
"alreadyZimed": {
124+
"alreadyMadeZim": "Kiwix has already made ZIM(s) of this website.",
125+
"downloadFromLibrary": " To save our resources, we invite you to download your ZIM from {link}.",
126+
"downloadFromLibraryLinkContent": "our library",
127+
"wp1Hint": "If you want to ZIM only few specific pages, you might be interested by {wp1Link}.",
128+
"wp1LinkContent": "our WP1 tool"
129+
},
130+
"tooBig": {
131+
"tooBigDetails": "This website is way too big to make it into a single ZIM.",
132+
"alreadyMadeZim": "Kiwix has already made few ZIM(s) of portions of this website.",
133+
"downloadOrRequest": "You can check these ZIMs in {libraryLink} or open an issue to {githubRequestLink} if it matches Kiwix purpose.",
134+
"libraryLinkContent": "our library",
135+
"githubRequestLinkContent": "request another ZIM",
136+
"useScraper": "Should you be a bit tech-savvy, you can also use {scraperRepoLink} on your own.",
137+
"scraperRepoLinkContent": "our dedicated scraper"
138+
},
139+
"notPossible": "It is unfortunately not possible to ZIM this website with zimit.",
140+
"scraperNeeded": "It seems possible to develop a custom scraper for this website, but we need your support on {githubIssueLink}.",
141+
"scraperNeededLinkContent": "this Github issue"
116142
}
117143
}

locales/qqq.json

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,5 +118,31 @@
118118
"goToTask": "This is the message inviting to open ongoing task",
119119
"excessiveUsage": "This is a more generic message about quota being reached",
120120
"abnormalUsage": "This is a generic error message when we fail to find quota information"
121+
},
122+
"blacklist": {
123+
"missingReason": "This is the error message when the blacklist reason is missconfigured (bug)",
124+
"missingLibraryUrl": "This is the error message when the library URL is missing in blacklist reason (bug)",
125+
"missingGithubIssueUrl": "This is the error message when the Github issue URL is missing in blacklist reason (bug)",
126+
"goBack": "This is the text on the go-back button",
127+
"contactUs": "This is the message inviting users to contact us",
128+
"alreadyZimed": {
129+
"alreadyMadeZim": "This is the message indicating that ZIM has already been built",
130+
"downloadFromLibrary": "This is the message indicating that ZIM can be downloaded",
131+
"downloadFromLibraryLinkContent": "This is the textual content of the download link",
132+
"wp1Hint": "This is a message inviting users to test our WP1 tool",
133+
"wp1LinkContent": "This is the textual content of the link to WP1"
134+
},
135+
"tooBig": {
136+
"tooBigDetails": "This is the message indicating that website is too big",
137+
"alreadyMadeZim": "This is the message indicating that few ZIMs have already been built",
138+
"downloadOrRequest": "This is the message inviting user to download ZIM or open a request",
139+
"libraryLinkContent": "This is the textual content of the download link",
140+
"githubRequestLinkContent": "This is the textual content of the Github request link",
141+
"useScraper": "This is the message inviting user to use our scraper",
142+
"scraperRepoLinkContent": "This is the textual content of the link to the scraper"
143+
},
144+
"notPossible": "This is the message indicating that it is not possible to ZIM this website",
145+
"scraperNeeded": "This is the message indicating that we intend to build a dedicated scraper",
146+
"scraperNeededLinkContent": "This is the textual content of the link to the dedicated Github issue"
121147
}
122148
}
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<script setup lang="ts">
2+
import { useMainStore } from '../stores/main'
3+
const mainStore = useMainStore()
4+
5+
const close = function () {
6+
mainStore.blacklistReason = undefined
7+
}
8+
</script>
9+
10+
<template>
11+
<div class="main">
12+
<div v-if="!mainStore.blacklistReason">{{ $t('blacklist.missingReason') }}</div>
13+
<div v-else-if="!mainStore.blacklistReason.libraryUrl">
14+
{{ $t('blacklist.missingLibraryUrl') }}
15+
</div>
16+
<div v-else>
17+
<p>{{ $t('blacklist.alreadyZimed.alreadyMadeZim') }}</p>
18+
<i18n-t keypath="blacklist.alreadyZimed.downloadFromLibrary" tag="p">
19+
<template #link>
20+
<a :href="mainStore.blacklistReason.libraryUrl" target="_blank">{{
21+
$t('blacklist.alreadyZimed.downloadFromLibraryLinkContent')
22+
}}</a>
23+
</template>
24+
</i18n-t>
25+
<i18n-t
26+
v-if="mainStore.blacklistReason.wp1Hint"
27+
keypath="blacklist.alreadyZimed.wp1Hint"
28+
tag="p"
29+
>
30+
<template #wp1Link>
31+
<a href="https://wp1.openzim.org/#/selections/simple" target="_blank">{{
32+
$t('blacklist.alreadyZimed.wp1LinkContent')
33+
}}</a>
34+
</template>
35+
</i18n-t>
36+
<p>{{ $t('blacklist.contactUs') }}</p>
37+
</div>
38+
<v-btn class="black" rounded="xl" @click="close">{{ $t('blacklist.goBack') }}</v-btn>
39+
</div>
40+
</template>
41+
42+
<style type="text/css" scoped>
43+
.v-btn {
44+
text-transform: none;
45+
background-color: transparent;
46+
}
47+
48+
.v-btn.black {
49+
background-color: black;
50+
color: white;
51+
}
52+
53+
p {
54+
margin-bottom: 1rem;
55+
}
56+
57+
.main {
58+
text-align: center;
59+
}
60+
</style>
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
<script setup lang="ts">
2+
import { useMainStore } from '../stores/main'
3+
const mainStore = useMainStore()
4+
5+
const close = function () {
6+
mainStore.blacklistReason = undefined
7+
}
8+
</script>
9+
10+
<template>
11+
<div class="main">
12+
<div v-if="!mainStore.blacklistReason">{{ $t('blacklist.missingReason') }}</div>
13+
<div v-else>
14+
<p>{{ $t('blacklist.notPossible') }}</p>
15+
<p>{{ $t('blacklist.contactUs') }}</p>
16+
</div>
17+
<v-btn class="black" rounded="xl" @click="close">{{ $t('blacklist.goBack') }}</v-btn>
18+
</div>
19+
</template>
20+
21+
<style type="text/css" scoped>
22+
.v-btn {
23+
text-transform: none;
24+
background-color: transparent;
25+
}
26+
27+
.v-btn.black {
28+
background-color: black;
29+
color: white;
30+
}
31+
32+
p {
33+
margin-bottom: 1rem;
34+
}
35+
36+
.main {
37+
text-align: center;
38+
}
39+
</style>

0 commit comments

Comments
 (0)