-
Notifications
You must be signed in to change notification settings - Fork 0
/
iphin1.py
49 lines (40 loc) · 1.01 KB
/
iphin1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import gc
import operator
import re
import sys
from collections import Counter
from urllib.parse import urljoin

import nltk
import pandas as pd
import requests
from bs4 import BeautifulSoup
from pandas import read_csv
# Archive index page whose '.ece' links are crawled.
ARCHIVE_URL = "http://www.thehindu.com/archive/web/2018/10/14/"
# CSV whose first column (header 'english') lists the terms to count.
PHRASE_CSV = r"ph_ftrs51dic.csv"
# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 30
# Compiled once; raw string so that \w is a regex class, not a str escape.
WORD_RE = re.compile(r"\w+")


def extract_words(text, stop_words):
    """Return the lowercased word tokens of *text* that are not stop words.

    Args:
        text: Plain text to tokenize with the ``\\w+`` pattern.
        stop_words: Container of lowercase words to drop; a set or
            frozenset keeps each membership test O(1).

    Returns:
        List of lowercase tokens in document order, stop words removed.
    """
    # Lowercase BEFORE the stop-word test: the stop-word list is lowercase,
    # so testing the raw token would let "The" / "And" through.
    lowered = (token.lower() for token in WORD_RE.findall(text))
    return [token for token in lowered if token not in stop_words]


def phrase_frequencies(words, phrases):
    """Count how often each entry of *phrases* occurs in *words*.

    Args:
        words: Iterable of hashable tokens (as produced by extract_words).
        phrases: Iterable of terms to look up; order and duplicates are
            preserved in the result.

    Returns:
        List of ``(phrase, count)`` tuples for phrases with count > 0.
    """
    # One pass to tally every word, instead of rescanning the whole word
    # list once per phrase.
    counts = Counter(words)
    return [(phrase, counts[phrase]) for phrase in phrases if counts[phrase] > 0]


def main():
    """Crawl the archive page and print term frequencies per linked page.

    Fetches ARCHIVE_URL, follows every anchor whose href ends in '.ece',
    and for each page prints its URL with the occurrence counts of the
    terms listed in PHRASE_CSV. Pages that cannot be fetched are reported
    on stderr and skipped so one bad link does not abort the crawl.

    Raises:
        requests.RequestException: if the archive index itself cannot be
            fetched.
    """
    archive = requests.get(ARCHIVE_URL, timeout=REQUEST_TIMEOUT)
    archive.raise_for_status()
    index_soup = BeautifulSoup(archive.text, "lxml")

    # Loop-invariant inputs are loaded once rather than once per page.
    ph_data = pd.read_csv(PHRASE_CSV, usecols=[0])
    phrases = ph_data['english'].tolist()
    stop_words = frozenset(nltk.corpus.stopwords.words('english'))
    # NOTE: terms are matched against lowercased single-word tokens, so a
    # term containing uppercase letters or spaces can never match.

    for link in index_soup.select("a[href$='.ece']"):
        # Resolve relative hrefs against the archive URL; absolute hrefs
        # are kept as they are.
        url = urljoin(ARCHIVE_URL, link.get('href'))
        try:
            page = requests.get(url, timeout=REQUEST_TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as err:
            print(url, ' skipped: ', err, file=sys.stderr)
            continue

        text = BeautifulSoup(page.text, "html5lib").get_text()
        words = extract_words(text, stop_words)

        # Keep the established output layout: a flat list alternating
        # [term], [count].
        word_freq = []
        for phrase, count in phrase_frequencies(words, phrases):
            word_freq.extend(([phrase], [count]))
        print(url, ' frequency= ', word_freq)


if __name__ == "__main__":
    main()