Skip to content

Commit

Permalink
add pl data
Browse files Browse the repository at this point in the history
  • Loading branch information
paiv committed Dec 26, 2024
1 parent 714183e commit 8d97eeb
Show file tree
Hide file tree
Showing 10 changed files with 4,089 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,4 +9,5 @@
| fr | [fci-breeds-fr.csv](fci-breeds-fr.csv) | https://www.fci.be/fr/nomenclature/ |
| de | [fci-breeds-de.csv](fci-breeds-de.csv) | https://www.fci.be/de/nomenclature/ |
| es | [fci-breeds-es.csv](fci-breeds-es.csv) | https://www.fci.be/es/nomenclature/ |
| pl | [fci-breeds-pl.csv](fci-breeds-pl.csv) | https://www.zkwp.pl/wzorce.php |
| uk | [fci-breeds-uk.csv](fci-breeds-uk.csv) | https://uku.com.ua/plem_work/breed_fci/ |
109 changes: 109 additions & 0 deletions code/crawler/crawl_pl.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env python
import core
import re
from collections import defaultdict
from lxml import html
from pathlib import Path
from urllib.parse import urljoin
from crawl_fci import FciCrawler, FciDumper


# Fixed breed names keyed by standard id; when an id is listed here the link
# text is ignored and this name is used verbatim.
# NOTE(review): presumably these are breeds whose several per-variety links
# do not merge into a usable common name -- confirm against the site.
_NAME_OVERRIDES = {
    '59': 'Gończy szwajcarski',
    '60': 'Gończy szwajcarski krótkonożny',
    '97': 'Szpic niemiecki',
}


def _joined_text(body, xpath):
    """Return the whitespace-normalised text selected by *xpath* on *body*.

    All string results of the XPath query are stripped, joined with single
    spaces and collapsed. Returns None when the query yields no text.
    """
    exslt = {'re': 'http://exslt.org/regular-expressions'}
    s = ' '.join(s.strip() for s in body.xpath(xpath, namespaces=exslt))
    if s:
        return ' '.join(s.split())
    return None


def _merge_names(a, b):
    """Reduce two names of the same breed to the words they share.

    The result is the common leading words of *a* and *b* followed by their
    common trailing words (taken from *a*). If they share nothing, *a* is
    used. Trailing words shorter than three characters are then dropped, as
    long as something remains.

    The prefix and the suffix are never allowed to overlap, so merging a
    name with itself (or with a name that is both its prefix and suffix)
    returns the shared words once instead of duplicating them.
    """
    ps, qs = a.split(), b.split()
    n = min(len(ps), len(qs))

    head = next((i for i, (x, y) in enumerate(zip(ps, qs)) if x != y), n)
    tail = next(
        (i for i, (x, y) in enumerate(zip(reversed(ps), reversed(qs))) if x != y),
        n,
    )
    # Words already counted in the common prefix must not be counted again
    # in the common suffix.
    tail = min(tail, n - head)

    cs = []
    if head:
        cs.append(' '.join(ps[:head]))
    if tail:
        cs.append(' '.join(ps[-tail:]))
    res = ' '.join(cs) if cs else a

    ps = res.split()
    while ps and len(ps[-1]) < 3:
        ps.pop()
    return ' '.join(ps) if ps else res


class PlParser(core.Parser):
    """Parser for the breed listing page: one item per breed link."""

    def getcontent(self, request):
        """Return the page as a dict with its URL and parsed lxml HTML body."""
        return {'url': request.url, 'body': html.fromstring(request.content)}

    def items(self, page):
        """Yield one item dict per breed id found on *page*.

        Every ``div.card`` is treated as a group: its header gives the group
        name and each link in its body points at a breed standard whose file
        name stem is the numeric breed id. Several links with the same id
        are merged into a single item.

        Each item has the keys ``refid``, ``url``, ``group`` and ``name``;
        ``pdf`` is added (and ``url`` stays the listing page URL) when the
        link targets a PDF, otherwise ``url`` is the link target.

        Raises:
            Exception: if a link's file name stem is not a number.
            ValueError: if a card does not contain exactly one card body.
        """
        for group_el in page['body'].xpath('//div[@class = "card"]'):
            header = _joined_text(
                group_el,
                'descendant::div[@class = "card-header"]/descendant::*/text()')
            # Keep only what follows the first run of digits in the header;
            # a card without header text yields an empty group name.
            group = re.split(r'\d+\s*', header or '', maxsplit=1)[-1]

            # Strict unpacking: exactly one card body is expected per card.
            card, = group_el.xpath('descendant::div[@class = "card-body"]')
            seen = dict()
            urls = dict()

            for row in card.xpath('descendant::a'):
                href = row.attrib['href']
                rid = Path(href).stem
                if not rid.isdigit():
                    raise Exception(f'invalid id {rid!r}')

                # ``row.text`` is None (or blank) when the anchor starts with
                # a child element; fall back to all text inside the anchor.
                name = (row.text or '').strip()
                if not name:
                    name = _joined_text(row, 'descendant::text()') or ''

                override = _NAME_OVERRIDES.get(rid)
                prev = seen.get(rid)
                if override:
                    name = override
                elif prev:
                    # A text-less duplicate link keeps the name collected so far.
                    name = _merge_names(name, prev) if name else prev

                seen[rid] = name.strip()
                # The last link seen for an id wins.
                urls[rid] = urljoin(page['url'], href)

            for rid, name in seen.items():
                item = {
                    'refid': rid,
                    'url': page['url'],
                    'group': group,
                    'name': name,
                }
                link = urls[rid]
                if link.lower().endswith('.pdf'):
                    item['pdf'] = link
                else:
                    item['url'] = link
                yield item

    def parse(self, item, page):
        """Always raise: this parser does not parse per-item pages."""
        raise Exception()

    def links(self, page):
        """Return an empty list: no further links are reported for *page*."""
        return list()


def main(args):
    """Run the crawl described by the parsed command-line *args*.

    Reads ``args.url``, ``args.data_dir`` and ``args.reset``; when ``reset``
    is set the crawler is reset before crawling. ``args.language`` is not
    used here.
    """
    crawler = FciCrawler(
        url=args.url,
        basedir=args.data_dir,
        parser=PlParser(),
    )
    if args.reset:
        crawler.reset()
    crawler.crawl()


if __name__ == '__main__':
    import argparse

    # Script entry point: build the command-line interface and hand the
    # parsed options straight to main().
    cli = argparse.ArgumentParser()
    cli.add_argument('--reset', action='store_true', help='Reset data')
    cli.add_argument('-o', '--data-dir', default='data', help='Data directory')
    cli.add_argument('-l', '--language', default='pl', help='Language identifier')
    cli.add_argument(
        'url',
        nargs='?',
        help='Base URL',
        default='https://www.zkwp.pl/wzorce.php',
    )
    main(cli.parse_args())
1 change: 1 addition & 0 deletions code/genpage.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def entries():
| <a href="index-fr.html">fr</a>
| <a href="index-de.html">de</a>
| <a href="index-es.html">es</a>
| <a href="index-pl.html">pl</a>
| <a href="index-uk.html">uk</a>
</div>
<p><a href="https://ukrainewar.carrd.co/"><img src="StandWithUkraine.svg" alt="standwithukraine"></a></p>
Expand Down
1 change: 1 addition & 0 deletions docs/index-de.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ <h1>FCI Breeds</h1>
| <a href="index-fr.html">fr</a>
| <a href="index-de.html">de</a>
| <a href="index-es.html">es</a>
| <a href="index-pl.html">pl</a>
| <a href="index-uk.html">uk</a>
</div>
<p><a href="https://ukrainewar.carrd.co/"><img src="StandWithUkraine.svg" alt="standwithukraine"></a></p>
Expand Down
1 change: 1 addition & 0 deletions docs/index-es.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ <h1>FCI Breeds</h1>
| <a href="index-fr.html">fr</a>
| <a href="index-de.html">de</a>
| <a href="index-es.html">es</a>
| <a href="index-pl.html">pl</a>
| <a href="index-uk.html">uk</a>
</div>
<p><a href="https://ukrainewar.carrd.co/"><img src="StandWithUkraine.svg" alt="standwithukraine"></a></p>
Expand Down
1 change: 1 addition & 0 deletions docs/index-fr.html
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ <h1>FCI Breeds</h1>
| <a href="index-fr.html">fr</a>
| <a href="index-de.html">de</a>
| <a href="index-es.html">es</a>
| <a href="index-pl.html">pl</a>
| <a href="index-uk.html">uk</a>
</div>
<p><a href="https://ukrainewar.carrd.co/"><img src="StandWithUkraine.svg" alt="standwithukraine"></a></p>
Expand Down
Loading

0 comments on commit 8d97eeb

Please sign in to comment.