From 7cb83e4c55e0b2b1f3fedcec4b3292043c6d05ee Mon Sep 17 00:00:00 2001 From: venvis <127123047+venvis@users.noreply.github.com> Date: Thu, 21 Mar 2024 12:49:38 +0400 Subject: [PATCH] Delete cellar/cellar_extractor/operative_extraction.py --- .../cellar_extractor/operative_extraction.py | 399 ------------------ 1 file changed, 399 deletions(-) delete mode 100644 cellar/cellar_extractor/operative_extraction.py diff --git a/cellar/cellar_extractor/operative_extraction.py b/cellar/cellar_extractor/operative_extraction.py deleted file mode 100644 index 72f471c..0000000 --- a/cellar/cellar_extractor/operative_extraction.py +++ /dev/null @@ -1,399 +0,0 @@ -import requests -from bs4 import BeautifulSoup -import unittest -from operative_extraction import Analyzer -import csv -import json -class Analyzer(): - """ - This class returns a list of the operative part for a given celex id . Celex id is initialized through a constructor. - """ - celex:str # declare celex as a string - def __init__(self,celex):# Initialize Celex id as a constructor , passed when calling the class - self.celex=celex - - - def html_page_structure_one(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested - table structure . The relevant text lies inside the coj-bold class of the span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('table') # Find all tables tag from the website - one=[] - for divs in div: - table=divs.find('table') # Find each nested table within the table - if table!=None: - p=table.find_all('p',class_="coj-normal") # Find all p under the nested table with the coj-normal class - for x in p: - span=x.find_all('span',class_="coj-bold")# Span class of coj-bold under the p tag - for y in span: - if x!=None and y!=None: - - one.append(y.text)#append text from span onto a list - return one - - - def html_page_structure_two(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - p=parser.find_all('p') - two=[] - for para in p: - - span=para.find('span') - if span!=None: - - if "operative" in span.text.lower(): - normal=span.find_all_next('p',class_="normal") - for op in normal: - - two.append(op.text) - return two - - def structure_three(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a nested - table structure . The relevant text lies inside the coj-bold class of the span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - table=parser.find_all('table') - three=[] - for tables in table: - interior=tables.find_all('table') - for interiors in interior: - if interiors!=None: - p=interiors.find_all('p',class_="coj-normal") - for x in p: - span=x.find_all('span',class_="coj-bold") - for y in span: - if x!=None and y!=None: - - three.append(y.text) - return three - - - - def structure_four(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the p tag which comes after the keyword operative of the previous span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - p=parser.find_all('p') - four=[] - for para in p: - - span=para.find('span') - if span!=None: - - if "operative" in span.text.lower(): - normal=span.find_all_next('table') - for op in normal: - tbody=op.find('tbody') - new_p=tbody.find_all('p',class_="oj-normal") - - - for subsequent in new_p: - if subsequent!=None: - - four.append(subsequent.text) - - - return four - - def structure_five(self)->list: - - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a paragraph - (p) structure . The relevant text lies inside the normal class of the p tag which comes after the keyword operative of the previous span tag. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - p=parser.find_all('p') - five=[] - for para in p: - - span=para.find('span') - if span!=None: - - if "operative" in span.text.lower(): - normal=span.find_all_next('table') - for op in normal: - tbody=op.find('tbody') - new_p=tbody.find_all('p',class_="normal") - - - for subsequent in new_p: - if subsequent!=None: - - five.append(subsequent.text) - - - return five - def structure_six(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a h2 - (header) structure . The relevant text lies inside thee p tag which comes after the keyword operative part of the respective h2 tag. - """ - - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('h2') - six=[] - for h2 in div: - # print(h2.text) - if h2.text=="Operative part": - operatives=h2.find_all_next('p') - for operative in operatives: - - six.append(operative.text) - return six - def structure_seven(self)->list: - """ - This function retreives operative part from documents of the respected celex id's . This function scrapes/parse the operative part from a table - (table) structure . The relevant text lies inside the span tag which comes after the p tag , with the class name=normal. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - div=parser.find_all('table') - seven=[] - for divs in div: - # find tbody within the table - table=divs.find_all('tbody') - for tables in table: - if tables!=None: - # find tr within the tbody - p=tables.find_all('tr') - for x in p: - if x!=None: - # find td within the tr - td=x.find_all('td') - for y in td: - if y!=None: - p=y.find_all('p',class_="normal") - for all in p: - if all!=None: - # find operative part within the span - span=all.find_all('span',class_="bold") - for spans in span: - # APpend it into a list and return the list when the function is called - seven.append(spans.text) - return seven - def structure_eight(self)->list: - """ - This function retreives operative part from documents of the respected celex id's .The text is extracted from the span tag nested inside - the tbody tag.Returns a list as output. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - - tbody=parser.find_all('tbody') - eight=[] - for all in tbody: - if all!=None: - tr=all.find_all('tr') - for trs in tr: - if trs!=None: - - - p=parser.find_all('p',class_="normal") - for paras in p: - if paras!=None: - if "on those grounds" in paras.text.lower(): - - span=paras.find_all_next('span',class_="bold") - for spans in span: - if spans!=None: - eight.append(spans.text) - - - return eight - def structure_nine(self)->list: - """ - This function retreives operative part from documents of the respected celex id's .The operative part is under the bold(b) - tag after the p tag where the keywords "on those grounds" exist. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - nine=[] - div=parser.find_all('p') - for divs in div: - if divs!=None: - if "on those grounds" in divs.text.lower(): - b=divs.find_all_next('b') - for bolds in b: - # print(bolds.text) - nine.append(bolds.text) - return nine - def structure_eleven(self)->list: - """ - This function retreives operative part from documents of the respected celex id's .The operative part is under the paragraph(p) - tag after the b tag where the keywords "operative part" exist. - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - bold = parser.find_all('b') - - eleven=[] - - - - for b in bold: - if b!=None: - if "operative part" in b.text.lower(): - table=b.find_all_next('p') - for tables in table: - if tables!=None: - eleven.append(tables.text) - - - - - return eleven - def structure_ten(self): - """ - This function retreives operative part from documents of the respected celex id's Since the ocntent is preloaded using js/client s - server side functions , the text from the current page is retrieved and the operative part is scraped after the occurence of the phrase - "On those grounds". - """ - website=requests.get(f"https://eur-lex.europa.eu/legal-content/EN/TXT/?uri=CELEX%3A{self.celex}&from=EN").text - parser=BeautifulSoup(website,'lxml') - appender=[] - for string in parser.stripped_strings: - - appender.append(string) - - - found = False - afterGrounds = [] - - for x in appender: - - if "on those grounds" in x.lower(): - found = True - - - if found: - if len(x.split(" "))>3: - afterGrounds.append(x) - return afterGrounds - def __call__(self)->list: - """ - This inbuilt __call__ function loops through all the methods in the class `Analyzer` and returns the list , with values of the operative part . - """ - - container=[self.html_page_structure_one(),self.html_page_structure_two(),self.structure_three(),self.structure_four(),self.structure_five(), - self.structure_six(),self.structure_seven(),self.structure_eight(),self.structure_nine(),self.structure_ten(),self.structure_eleven()] - - - - one:list - for funcs in range(len(container)): - - one=container[funcs] - - if one: - if (len(one)!=0 or one[0]!="\n"): - print("here") - return one - - - - - - - # one=self.html_page_structure_one() - # if len(one)==0 or len(one)=="\n": - # one=self.html_page_structure_two() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_three() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_four() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_five() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_six() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_seven() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_eight() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_nine() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_ten() - # if len(one)==0 or one[0]=="\n": - # one=self.structure_eleven() - - - - - - - -# instance=Analyzer("61962CJ0026") -# x=instance() -# if x!=None: -# print(x) - - -class Writing(): - """ - This class has different methods , for the purpose of writing the operative part into different file formats.(Csv,txt,json) - """ - - instance:str - x:str - parameter:str - def __init__(self, celex:str): - self.celex = celex - self.instance = Analyzer(self.celex) - self.x = self.instance() - - - - def to_csv(self): - file=open("csv/output.csv","a+") - writer=csv.writer(file) - - if self.x!=None: - writer.writerow([self.celex,self.x]) - - def to_json(self): - if self.x!=None: - data={'Celex':self.celex,"Operative part":self.x} - file=open('json/data.json', 'a+') - json.dump(data,file) - file.close() - def to_txt(self): - - - if self.x!=None: - file=open(f"txt/{self.celex}.txt","a") - for w in self.x: - - file.write(w+"\n") - file.close() -#Sample code for reading celex id's froma tsv file - -file=open("gijs_202310_node_list.tsv","r") -reader=csv.reader(file) -from output import Writing -testing=[] -for row in reader: - for rows in row: - if "Id" not in rows: - testing.append(rows.split("\t")[0]) -for all in testing: - instance=Writing(all) - instance.to_csv() - print(all) - -