Skip to content

Commit

Permalink
Python scraping files
Browse files Browse the repository at this point in the history
  • Loading branch information
venvis committed Feb 8, 2024
1 parent 7868c30 commit f1aaaf3
Show file tree
Hide file tree
Showing 3 changed files with 215 additions and 0 deletions.
39 changes: 39 additions & 0 deletions cellar/cellar_extractor/citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import requests
from bs4 import BeautifulSoup

def get_citations_from_celex_id(celex) -> list:
    """Return the citation identifiers (CELEX ids) that cite a given document.

    Scrapes the EUR-Lex "ALL" page for *celex* and collects the linked
    entries listed under the metadata heading that mentions "cited".

    :param celex: CELEX identifier of the document, e.g. ``"61962CJ0026"``.
    :return: list of single-token citation identifiers; empty list when the
        page carries no citation metadata.
    """
    website = requests.get(
        f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{celex}",
        timeout=60,  # avoid hanging indefinitely on an unresponsive server
    ).text
    parser = BeautifulSoup(website, 'lxml')
    citations = []
    for panel in parser.find_all('div', class_="panel-body"):
        dl = panel.find('dl', class_="NMetadata")
        if dl is None:
            continue
        for child in dl:
            # Only definition-list entries whose heading mentions "cited"
            # carry the citation metadata we want.
            if "cited" not in child.text.lower():
                continue
            for dd in child.find_all_next('dd'):
                for mention in dd.find_all('li'):
                    a = mention.find('a')
                    if a is not None:
                        citations.append(a.text)
    # Keep only single-token entries: multi-word strings are descriptive
    # text rather than bare citation identifiers.
    return [entry for entry in citations if len(entry.split(" ")) < 2]


if __name__ == "__main__":
    # Demo invocation; guarded so importing this module does not trigger
    # a network request as a side effect.
    sample = get_citations_from_celex_id("61962CJ0026")
    print(sample)
137 changes: 137 additions & 0 deletions cellar/cellar_extractor/operative_extractions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import requests
from bs4 import BeautifulSoup
import unittest
# class ECLI():
# ecli:str
# def __init__(self,ecli):
# self.ecli=ecli
class Analyzer():
    """Extract the operative part of a CELEX judgment from EUR-Lex.

    EUR-Lex serves judgments in several distinct HTML layouts; each
    ``html_page_structure_*`` / ``structure_*`` method targets one layout.
    Calling the instance tries each layout in turn and returns the first
    non-empty extraction.
    """

    # CELEX identifier fragment interpolated into the EUR-Lex URL
    # (e.g. "3A62018CA0390", which completes the "%3A" = ":" escape).
    celex: str

    def __init__(self, celex):
        self.celex = celex

    def _document_soup(self) -> BeautifulSoup:
        """Fetch and parse the EUR-Lex page for this instance's CELEX id."""
        url = (
            "https://eur-lex.europa.eu/legal-content/EN/TXT/"
            f"?uri=CELEX%{self.celex}&from=EN"
        )
        # timeout avoids hanging indefinitely on an unresponsive server
        return BeautifulSoup(requests.get(url, timeout=60).text, 'lxml')

    def html_page_structure_one(self) -> list:
        """Layout 1: bold spans inside nested tables ("coj-" CSS prefix).

        :return: list of extracted text fragments (may be empty).
        """
        one = []
        for outer in self._document_soup().find_all('table'):
            table = outer.find('table')
            if table is None:
                continue
            p = table.find('p', class_="coj-normal")
            if p is None:
                # Guard added: original called p.find(...) before checking,
                # crashing with AttributeError when the paragraph is absent.
                continue
            span = p.find('span', class_="coj-bold")
            if span is not None:
                print(span.text)
                one.append(span.text)
        return one

    def html_page_structure_two(self) -> list:
        """Layout 2: "normal" paragraphs following an "operative" heading span.

        :return: list of extracted text fragments (may be empty).
        """
        two = []
        for para in self._document_soup().find_all('p'):
            span = para.find('span')
            if span is not None and "operative" in span.text.lower():
                for op in span.find_all_next('p', class_="normal"):
                    print(op.text)
                    two.append(op.text)
        return two

    def structure_three(self) -> list:
        """Layout 3: bold spans inside tables nested one level deeper.

        :return: list of extracted text fragments (may be empty).
        """
        three = []
        for table in self._document_soup().find_all('table'):
            for inner in table.find_all('table'):
                p = inner.find('p', class_="coj-normal")
                if p is None:
                    # Guard added: original dereferenced p unconditionally.
                    continue
                span = p.find('span', class_="coj-bold")
                if span is not None:
                    print(span.text)
                    three.append(span.text)
        return three

    def structure_four(self) -> list:
        """Layout 4: "oj-normal" paragraphs in tables after the operative heading.

        :return: list of extracted text fragments (may be empty).
        """
        four = []
        for para in self._document_soup().find_all('p'):
            span = para.find('span')
            if span is None or "operative" not in span.text.lower():
                continue
            for op in span.find_all_next('table'):
                tbody = op.find('tbody')
                if tbody is None:
                    # Guard added: original crashed on tables without <tbody>.
                    continue
                for subsequent in tbody.find_all('p', class_="oj-normal"):
                    print(subsequent.text)
                    four.append(subsequent.text)
        return four

    def structure_five(self) -> list:
        """Layout 5: "normal" paragraphs in tables after the operative heading.

        :return: list of extracted text fragments (may be empty).
        """
        five = []
        for para in self._document_soup().find_all('p'):
            span = para.find('span')
            if span is None or "operative" not in span.text.lower():
                continue
            for op in span.find_all_next('table'):
                tbody = op.find('tbody')
                if tbody is None:
                    # Guard added: original crashed on tables without <tbody>.
                    continue
                for subsequent in tbody.find_all('p', class_="normal"):
                    print(subsequent.text)
                    five.append(subsequent.text)
        return five

    def __call__(self) -> list:
        """Try each known layout in order and return the first non-empty result.

        :return: list of operative-part text fragments (empty if no layout matched).
        """
        operative = self.html_page_structure_one()
        if not operative:
            operative = self.html_page_structure_two()
        if not operative:
            operative = self.structure_three()
        if not operative:
            # Bug fix: original called one.structure_four() on a list,
            # raising AttributeError whenever this fallback was reached.
            operative = self.structure_four()
        if not operative:
            operative = self.structure_five()
        print(operative)
        # Bug fix: original was annotated "-> list" but returned None.
        return operative


if __name__ == "__main__":
    # Demo invocation; guarded so importing this module does not trigger
    # network requests as a side effect.
    instance = Analyzer("3A62018CA0390")
    instance()



39 changes: 39 additions & 0 deletions cellar/cellar_extractor/para.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import requests
from bs4 import BeautifulSoup

def get_para_citations_from_celex_id(celex) -> list:
    """Return paragraph-level citation references for a given document.

    Scrapes the EUR-Lex "ALL" page for *celex*, collects the list items
    under the metadata heading that mentions "cited" which contain a bare
    "p" token (a paragraph marker), and returns the text after the first
    colon of each such entry.

    :param celex: CELEX identifier of the document, e.g. ``"61962CJ0026"``.
    :return: list of paragraph citation strings; empty list when none found.
    """
    website = requests.get(
        f"https://eur-lex.europa.eu/legal-content/EN/ALL/?uri=CELEX:{celex}",
        timeout=60,  # avoid hanging indefinitely on an unresponsive server
    ).text
    parser = BeautifulSoup(website, 'lxml')
    citations = []
    for panel in parser.find_all('div', class_="panel-body"):
        dl = panel.find('dl', class_="NMetadata")
        if dl is None:
            continue
        for child in dl:
            # Only definition-list entries whose heading mentions "cited"
            # carry the citation metadata we want.
            if "cited" not in child.text.lower():
                continue
            for dd in child.find_all_next('dd'):
                for mention in dd.find_all('li'):
                    # A bare "p" token marks a paragraph-level citation.
                    if "p" in mention.text.lower().split(" "):
                        citations.append(mention.text)
    filtered = []
    for entry in citations:
        parts = entry.split(":")
        # Bug fix: original indexed parts[1] unconditionally and raised
        # IndexError on entries without a colon; skip those instead.
        if len(parts) > 1:
            filtered.append(parts[1])
    return filtered

if __name__ == "__main__":
    # Demo invocation; guarded so importing this module does not trigger
    # a network request as a side effect.
    sample = get_para_citations_from_celex_id("61962CJ0026")
    print(sample)

0 comments on commit f1aaaf3

Please sign in to comment.