-
Notifications
You must be signed in to change notification settings - Fork 0
/
proyectos.py
67 lines (55 loc) · 1.7 KB
/
proyectos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import pymongo
import re
import requests
import urllib3
from bs4 import BeautifulSoup
from datetime import datetime
from html.parser import HTMLParser
from pymongo import MongoClient
# Suppress urllib3's InsecureRequestWarning: adjuntos() issues its requests
# with verify=False, which would otherwise emit that warning per request.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def connection():
    """Return a handle to the ``proyectospy`` database.

    The client targets the MongoDB server at ``localhost:27017``.
    """
    return MongoClient("mongodb://localhost:27017/")["proyectospy"]
def proyectos():
    """Fetch the project list from the congreso open-data API and annotate it.

    Every project dict gets two extra keys:

    * ``tweet`` -- always initialised to ``False``.
    * ``urls``  -- the list produced by ``adjuntos()`` for the project's
      ``appURL`` (an empty list when the project has no ``appURL`` or the
      scrape yields nothing).

    Returns:
        list: the annotated project dicts, or an empty list when the API
        could not be reached or did not return a JSON array.  The function
        never returns ``None``, so callers can always iterate/sort the
        result.
    """
    try:
        response = requests.get(
            "http://datos.congreso.gov.py/opendata/api/data/proyecto",
            timeout=5,
        )
        # Fail on 4xx/5xx here instead of trying to parse an error page as JSON.
        response.raise_for_status()
        payload = json.loads(response.text)
    except requests.ConnectionError:
        print("error al conectar")
        return []
    except Exception as e:
        # Deliberate best-effort: any other fetch/parse failure (timeout,
        # HTTP error status, invalid JSON) is reported and yields no projects.
        print(e)
        return []

    # Assumes the endpoint returns a JSON array of objects; anything else is
    # treated as "no data" rather than crashing while annotating.
    if not isinstance(payload, list):
        print("respuesta inesperada de la API")
        return []

    projects = [item for item in payload if isinstance(item, dict)]
    for project in projects:
        project['tweet'] = False
        link = project.get('appURL')
        # A missing appURL or a failed scrape must not discard the whole batch.
        if link:
            project['urls'] = adjuntos(link) or []
        else:
            project['urls'] = []
    return projects
def adjuntos(link):
    """Collect the URLs referenced by the buttons of a project page.

    The page at ``link`` is downloaded and parsed; for every ``<button>``
    that carries an ``onclick`` attribute, the attribute text is reduced to
    a bare URL by removing backslashes, parentheses, single quotes and the
    ``window.open`` / ``,_blank`` fragments.

    Args:
        link: URL of the page to scrape.

    Returns:
        list: the extracted strings, de-duplicated in first-seen order.
        Empty when the page cannot be fetched or has no matching buttons;
        the function never raises and never returns ``None``.
    """
    adj = []
    try:
        response = requests.get(
            link,
            timeout=10,
            headers={'user-agent': 'Mozilla/5.0'},
            # TLS verification is intentionally off here; the matching
            # warning is silenced at module level.
            verify=False,
        )
        soup = BeautifulSoup(response.text, "html.parser")
    except Exception as e:
        # Best-effort per page: report the problem and return no attachments.
        print(e)
        return adj

    onclick_values = [button.get('onclick') for button in soup.find_all('button')]
    # dict.fromkeys() drops duplicates while keeping document order.
    for onclick in dict.fromkeys(onclick_values):
        # Buttons without an onclick handler yield None; skipping them keeps
        # one such button from discarding every URL on the page.
        if not onclick:
            continue
        url = re.sub(r"[\\()']", '', onclick)
        url = url.replace("window.open", "").replace(",_blank", "")
        adj.append(url)
    return adj
def write_output():
    """Store the fetched projects in the ``proyectos`` MongoDB collection.

    Projects are inserted in ascending ``idProyecto`` order.  A unique index
    on ``idProyecto`` is ensured once, before any insert, so that projects
    already stored are rejected by MongoDB and silently skipped.

    Does nothing when ``proyectos()`` yields no data.
    """
    # proyectos() may yield None or an empty list when the fetch fails.
    projects = proyectos() or []
    if not projects:
        return

    db = connection()
    try:
        # Ensure the index once up front instead of on every loop iteration.
        db.proyectos.create_index("idProyecto", unique=True)
    except pymongo.errors.DuplicateKeyError as e:
        # Existing documents already violate uniqueness; keep going as before
        # but make the problem visible.
        print("no se pudo crear el indice unico en idProyecto:", e)

    for project in sorted(projects, key=lambda p: p['idProyecto']):
        try:
            db.proyectos.insert_one(project)
        except pymongo.errors.DuplicateKeyError:
            # Already stored on a previous run -- skipping is the intent.
            pass
# Script entry point: runs the fetch-and-store job as soon as this module is
# loaded. NOTE(review): there is no `if __name__ == "__main__":` guard, so
# importing this file also triggers the job -- confirm that is intended.
write_output()