Migration of examples from Python 2 to be compatible with both Python 2 and 3 via the six library. #20

Open · wants to merge 1 commit into base: master
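Context for every hunk below: in Python 2, print is a statement, while in Python 3 it is a built-in function. A parenthesized print(a, b) still parses on Python 2, but it prints a tuple there, so dual-version code conventionally pairs such calls with from __future__ import print_function or with six's print_ wrapper (neither import is visible in the hunks shown, so treat this as background, not as what the commit does). A minimal sketch of the idiom:

    # Dual-version print idiom (sketch; the __future__ import must appear
    # before any other statement in the module).
    from __future__ import print_function

    import six

    # Without the __future__ import, Python 2 would print the tuple
    # ('match: ', 'animals'); with it, both interpreters print: match:  animals
    print("match: ", "animals")

    # six also ships an explicit print function, handy where the
    # __future__ import cannot be used:
    six.print_("running on Python", 2 if six.PY2 else 3)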
8 changes: 4 additions & 4 deletions ch2/2_2_Basic_webscraping_byusing_beautifulsuop.py
@@ -12,10 +12,10 @@ def Get_the_page_by_beautibulsoup():
     #print soup()
     #print(soup.prettify()) #display source of the html page in readable format.
     soup = BeautifulSoup(page.content, 'html.parser')
-    print soup.find_all('p')[0].get_text()
-    print soup.find_all('p')[1].get_text()
-    print soup.find_all('p')[2].get_text()
-    print soup.find_all('p')[3].get_text()
+    print(soup.find_all('p')[0].get_text())
+    print(soup.find_all('p')[1].get_text())
+    print(soup.find_all('p')[2].get_text())
+    print(soup.find_all('p')[3].get_text())


 if __name__ =="__main__":
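The hunk above touches only the print calls; the page object with its .content attribute is created outside the hunk, presumably with the requests library (an assumption, since the fetch code is not shown). A self-contained sketch of the same fetch-parse-print flow on both interpreters:

    # Sketch of the surrounding flow; the URL is a placeholder and the use
    # of requests is assumed, not confirmed by the hunk above.
    from __future__ import print_function

    import requests
    from bs4 import BeautifulSoup

    page = requests.get("https://example.com")
    soup = BeautifulSoup(page.content, 'html.parser')
    for p in soup.find_all('p')[:4]:  # the first four paragraphs, as above
        print(p.get_text())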
34 changes: 17 additions & 17 deletions ch3/3_1_wordsteam.py
@@ -21,32 +21,32 @@

 def stemmer_porter():
     port = PorterStemmer()
-    print "\nDerivational Morphemes"
-    print " ".join([port.stem(i) for i in text6.split()])
-    print " ".join([port.stem(i) for i in text7.split()])
-    print "\nInflectional Morphemes"
-    print " ".join([port.stem(i) for i in text8.split()])
-    print " ".join([port.stem(i) for i in text9.split()])
-    print "\nSome examples"
-    print " ".join([port.stem(i) for i in word.split()])
-    print " ".join([port.stem(i) for i in text.split()])
-    print " ".join([port.stem(i) for i in text1.split()])
-    print " ".join([port.stem(i) for i in text2.split()])
-    print " ".join([port.stem(i) for i in text3.split()])
-    print " ".join([port.stem(i) for i in text4.split()])
-    print " ".join([port.stem(i) for i in text5.split()])
+    print("\nDerivational Morphemes")
+    print(" ".join([port.stem(i) for i in text6.split()]))
+    print(" ".join([port.stem(i) for i in text7.split()]))
+    print("\nInflectional Morphemes")
+    print(" ".join([port.stem(i) for i in text8.split()]))
+    print(" ".join([port.stem(i) for i in text9.split()]))
+    print("\nSome examples")
+    print(" ".join([port.stem(i) for i in word.split()]))
+    print(" ".join([port.stem(i) for i in text.split()]))
+    print(" ".join([port.stem(i) for i in text1.split()]))
+    print(" ".join([port.stem(i) for i in text2.split()]))
+    print(" ".join([port.stem(i) for i in text3.split()]))
+    print(" ".join([port.stem(i) for i in text4.split()]))
+    print(" ".join([port.stem(i) for i in text5.split()]))


 def polyglot_stem():
-    print "\nDerivational Morphemes using polyglot library"
+    print("\nDerivational Morphemes using polyglot library")
     for w in words_derv:
         w = Word(w, language="en")
         print("{:<20}{}".format(w, w.morphemes))
-    print "\nInflectional Morphemes using polyglot library"
+    print("\nInflectional Morphemes using polyglot library")
     for w in word_infle:
         w = Word(w, language="en")
         print("{:<20}{}".format(w, w.morphemes))
-    print "\nSome Morphemes examples using polyglot library"
+    print("\nSome Morphemes examples using polyglot library")
     for w in word_infle:
         w = Word(w, language="en")
         print("{:<20}{}".format(w, w.morphemes))
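The text variables (word, text, text1 through text9) are defined above this hunk and are not shown. As a hedged illustration of what PorterStemmer does with the two morpheme classes the headings name, using sample words chosen here rather than the book's:

    from __future__ import print_function
    from nltk.stem import PorterStemmer

    port = PorterStemmer()
    # Inflectional morphemes (grammatical endings) strip back to the base:
    print(port.stem("cars"))       # car
    print(port.stem("running"))    # run
    # Derivational morphemes often leave a non-word stem, by design:
    print(port.stem("happiness"))  # happi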
24 changes: 12 additions & 12 deletions ch3/3_2_tokenization_lemmatization.py
@@ -7,22 +7,22 @@ def wordtokenization():
     content = """Stemming is funnier than a bummer says the sushi loving computer scientist.
     She really wants to buy cars. She told me angrily. It is better for you.
     Man is walking. We are meeting tomorrow. You really don't know..!"""
-    print word_tokenize(content)
+    print(word_tokenize(content))

 def wordlemmatization():
     wordlemma = WordNetLemmatizer()
-    print wordlemma.lemmatize('cars')
-    print wordlemma.lemmatize('walking',pos='v')
-    print wordlemma.lemmatize('meeting',pos='n')
-    print wordlemma.lemmatize('meeting',pos='v')
-    print wordlemma.lemmatize('better',pos='a')
-    print wordlemma.lemmatize('is',pos='v')
-    print wordlemma.lemmatize('funnier',pos='a')
-    print wordlemma.lemmatize('expected',pos='v')
-    print wordlemma.lemmatize('fantasized',pos='v')
+    print(wordlemma.lemmatize('cars'))
+    print(wordlemma.lemmatize('walking',pos='v'))
+    print(wordlemma.lemmatize('meeting',pos='n'))
+    print(wordlemma.lemmatize('meeting',pos='v'))
+    print(wordlemma.lemmatize('better',pos='a'))
+    print(wordlemma.lemmatize('is',pos='v'))
+    print(wordlemma.lemmatize('funnier',pos='a'))
+    print(wordlemma.lemmatize('expected',pos='v'))
+    print(wordlemma.lemmatize('fantasized',pos='v'))

 if __name__ =="__main__":
     wordtokenization()
-    print "\n"
-    print "----------Word Lemmatization----------"
+    print("\n")
+    print("----------Word Lemmatization----------")
     wordlemmatization()
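One behavior worth noting in the calls above: WordNetLemmatizer.lemmatize() defaults to pos='n', which is why 'meeting' is passed with both noun and verb tags. A small sketch (requires nltk.download('wordnet')):

    from __future__ import print_function
    from nltk.stem import WordNetLemmatizer

    lemma = WordNetLemmatizer()
    print(lemma.lemmatize('meeting'))           # meeting (default pos='n')
    print(lemma.lemmatize('meeting', pos='v'))  # meet
    print(lemma.lemmatize('better', pos='a'))   # good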
8 changes: 4 additions & 4 deletions ch3/3_3_parsingtree.py
@@ -23,7 +23,7 @@ def definegrammar_pasrereult():
     parser = nltk.ChartParser(Grammar)
     trees = parser.parse(sent)
     for tree in trees:
-        print tree
+        print(tree)

 # Part 2: Draw the parse tree
 def draw_parser_tree():
@@ -50,9 +50,9 @@ def stanford_parsing_result():


 if __name__ == "__main__":
-    print "\n--------Parsing result as per defined grammar-------"
+    print("\n--------Parsing result as per defined grammar-------")
     definegrammar_pasrereult()
-    print "\n--------Drawing Parse Tree-------"
+    print("\n--------Drawing Parse Tree-------")
     draw_parser_tree()
-    print "\n--------Stanford Parser result------"
+    print("\n--------Stanford Parser result------")
     stanford_parsing_result()
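Grammar and sent are defined above this hunk and are not shown. A minimal, self-contained sketch of the same nltk.ChartParser pattern, with a toy grammar standing in for the book's:

    from __future__ import print_function
    import nltk

    # Toy grammar; the file's actual grammar lives outside the hunk shown.
    grammar = nltk.CFG.fromstring("""
    S -> NP VP
    NP -> Det N
    VP -> V NP
    Det -> 'the' | 'a'
    N -> 'dog' | 'cat'
    V -> 'chased'
    """)

    parser = nltk.ChartParser(grammar)
    sent = ['the', 'dog', 'chased', 'a', 'cat']
    for tree in parser.parse(sent):
        print(tree)  # (S (NP (Det the) (N dog)) (VP (V chased) (NP (Det a) (N cat))))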
32 changes: 16 additions & 16 deletions ch4/4_1_processrawtext.py
@@ -24,30 +24,30 @@ def readcorpus():
     return raw_content_cg[0:1000]

 if __name__ == "__main__":
-    print ""
-    print "----------Output from Raw Text file-----------"
-    print ""
+    print("")
+    print("----------Output from Raw Text file-----------")
+    print("")
     filecontentdetails = fileread()
-    print filecontentdetails
+    print(filecontentdetails)
     # sentence tokenizer
     st_list_rawfile = st(filecontentdetails)
-    print len(st_list_rawfile)
+    print(len(st_list_rawfile))

-    print ""
-    print "-------Output from assigned variable-------"
-    print ""
+    print("")
+    print("-------Output from assigned variable-------")
+    print("")
     localveriabledata = localtextvalue()
-    print localveriabledata
+    print(localveriabledata)
     # sentence tokenizer
     st_list_local = st(localveriabledata)
-    print len(st_list_local)
-    print st_list_local
+    print(len(st_list_local))
+    print(st_list_local)

-    print ""
-    print "-------Output Corpus data--------------"
-    print ""
+    print("")
+    print("-------Output Corpus data--------------")
+    print("")
     fromcorpusdata = readcorpus()
-    print fromcorpusdata
+    print(fromcorpusdata)
     # sentence tokenizer
     st_list_corpus = st(fromcorpusdata)
-    print len(st_list_corpus)
+    print(len(st_list_corpus))
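st here is presumably an alias for NLTK's sent_tokenize imported above the hunk; that is an assumption. The tokenize-then-count pattern in isolation (requires nltk.download('punkt')):

    from __future__ import print_function
    from nltk.tokenize import sent_tokenize as st  # assumed alias

    data = "This is one sentence. This is another. And a third!"
    st_list = st(data)
    print(len(st_list))  # 3
    print(st_list)       # ['This is one sentence.', 'This is another.', 'And a third!']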
24 changes: 12 additions & 12 deletions ch4/4_2_rawtext_Stemmers.py
@@ -8,28 +8,28 @@

 def stemmer_porter():
     port = PorterStemmer()
-    print "\nStemmer"
+    print("\nStemmer")
     return " ".join([port.stem(i) for i in text.split()])

 def lammatizer():
     wordnet_lemmatizer = WordNetLemmatizer()
     ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
     # Pos = verb
-    print "\nVerb lemma"
-    print " ".join([wordnet_lemmatizer.lemmatize(i,pos="v") for i in text.split()])
+    print("\nVerb lemma")
+    print(" ".join([wordnet_lemmatizer.lemmatize(i,pos="v") for i in text.split()]))
     # Pos = noun
-    print "\nNoun lemma"
-    print " ".join([wordnet_lemmatizer.lemmatize(i,pos="n") for i in text.split()])
+    print("\nNoun lemma")
+    print(" ".join([wordnet_lemmatizer.lemmatize(i,pos="n") for i in text.split()]))
     # Pos = Adjective
-    print "\nAdjective lemma"
-    print " ".join([wordnet_lemmatizer.lemmatize(i, pos="a") for i in text.split()])
+    print("\nAdjective lemma")
+    print(" ".join([wordnet_lemmatizer.lemmatize(i, pos="a") for i in text.split()]))
     # Pos = satellite adjectives
-    print "\nSatellite adjectives lemma"
-    print " ".join([wordnet_lemmatizer.lemmatize(i, pos="s") for i in text.split()])
-    print "\nAdverb lemma"
+    print("\nSatellite adjectives lemma")
+    print(" ".join([wordnet_lemmatizer.lemmatize(i, pos="s") for i in text.split()]))
+    print("\nAdverb lemma")
     # POS = Adverb
-    print " ".join([wordnet_lemmatizer.lemmatize(i, pos="r") for i in text.split()])
+    print(" ".join([wordnet_lemmatizer.lemmatize(i, pos="r") for i in text.split()]))

 if __name__ == "__main__":
-    print stemmer_porter()
+    print(stemmer_porter())
     lammatizer()
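The tuple assignment ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v' mirrors the part-of-speech constants NLTK's WordNet reader already exposes; a sketch of importing them instead (not what the file does):

    from __future__ import print_function
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    # wordnet.ADJ == 'a', wordnet.ADJ_SAT == 's', wordnet.ADV == 'r',
    # wordnet.NOUN == 'n', wordnet.VERB == 'v'
    lemmatizer = WordNetLemmatizer()
    print(lemmatizer.lemmatize("expected", pos=wordnet.VERB))  # expect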
18 changes: 9 additions & 9 deletions ch4/4_3_stopwordremove.py
@@ -3,24 +3,24 @@

 def stopwordlist():
     stopwordlist = stopwords.words('english')
-    print ""
-    print "------List of stop words-------"
+    print("")
+    print("------List of stop words-------")
     for s in stopwordlist:
-        print s
+        print(s)

 def customizedstopwordremove():
     stop_words = set(["hi", "bye"])
     line = """hi this is foo. bye"""
-    print ""
-    print "--------Customized stopword removal---------"
-    print " ".join(word for word in line.split() if word not in stop_words)
+    print("")
+    print("--------Customized stopword removal---------")
+    print(" ".join(word for word in line.split() if word not in stop_words))

 def stopwordremove():
     stop = set(stopwords.words('english'))
     sentence = "this is a test sentence. I am very happy today."
-    print ""
-    print "--------Stop word removal from raw text---------"
-    print " ".join([i for i in sentence.lower().split() if i not in stop])
+    print("")
+    print("--------Stop word removal from raw text---------")
+    print(" ".join([i for i in sentence.lower().split() if i not in stop]))
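The stopword list comes from NLTK's corpus and needs a one-time nltk.download('stopwords'); wrapping it in set(), as stopwordremove() does, makes each membership test O(1). A compact sketch:

    from __future__ import print_function
    from nltk.corpus import stopwords

    stop = set(stopwords.words('english'))  # set gives O(1) lookups
    sentence = "this is a test sentence. I am very happy today."
    print(" ".join(w for w in sentence.lower().split() if w not in stop))
    # -> test sentence. happy today.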
22 changes: 11 additions & 11 deletions ch4/4_4_wordtokenization.py
@@ -5,25 +5,25 @@ def wordtokenization():
     content = """Stemming is funnier than a bummer says the sushi loving computer scientist.
     She really wants to buy cars. She told me angrily. It is better for you.
     Man is walking. We are meeting tomorrow. You really don't know..!"""
-    print word_tokenize(content)
+    print(word_tokenize(content))

 def wordlemmatization():
     wordlemma = WordNetLemmatizer()
-    print wordlemma.lemmatize('cars')
-    print wordlemma.lemmatize('walking',pos='v')
-    print wordlemma.lemmatize('meeting',pos='n')
-    print wordlemma.lemmatize('meeting',pos='v')
-    print wordlemma.lemmatize('better',pos='a')
+    print(wordlemma.lemmatize('cars'))
+    print(wordlemma.lemmatize('walking',pos='v'))
+    print(wordlemma.lemmatize('meeting',pos='n'))
+    print(wordlemma.lemmatize('meeting',pos='v'))
+    print(wordlemma.lemmatize('better',pos='a'))

 def wordlowercase():
     text= "I am a person. Do you know what is time now?"
-    print text.lower()
+    print(text.lower())

 if __name__ =="__main__":
     wordtokenization()
-    print "\n"
-    print "----------Word Lemmatization----------"
+    print("\n")
+    print("----------Word Lemmatization----------")
     wordlemmatization()
-    print "\n"
-    print "----------converting data to lower case ----------"
+    print("\n")
+    print("----------converting data to lower case ----------")
     wordlowercase()
54 changes: 27 additions & 27 deletions ch4/4_5_regualrexpression.py
@@ -5,75 +5,75 @@ def searchvsmatch():

     matchObj = re.match(r'animals', line, re.M | re.I)
     if matchObj:
-        print "match: ", matchObj.group()
+        print("match: ", matchObj.group())
     else:
-        print "No match!!"
+        print("No match!!")

     searchObj = re.search(r'animals', line, re.M | re.I)
     if searchObj:
-        print "search: ", searchObj.group()
+        print("search: ", searchObj.group())
     else:
-        print "Nothing found!!"
+        print("Nothing found!!")


 def basicregex():
     line = "This is test sentence and test sentence is also a sentence."
     contactInfo = 'Doe, John: 1111-1212'
-    print "-----------Output of re.findall()--------"
+    print("-----------Output of re.findall()--------")
     # re.findall() finds all occurences of sentence from line variable.
     findallobj = re.findall(r'sentence', line)
-    print findallobj
+    print(findallobj)

     # re.search() and group wise extraction
     groupwiseobj = re.search(r'(\w+), (\w+): (\S+)', contactInfo)
-    print "\n"
-    print "-----------Output of Groups--------"
-    print "1st group ------- " + groupwiseobj.group(1)
-    print "2nd group ------- " + groupwiseobj.group(2)
-    print "3rd group ------- " + groupwiseobj.group(3)
+    print("\n")
+    print("-----------Output of Groups--------")
+    print("1st group ------- " + groupwiseobj.group(1))
+    print("2nd group ------- " + groupwiseobj.group(2))
+    print("3rd group ------- " + groupwiseobj.group(3))

     # re.sub() replace string
     phone = "1111-2222-3333 # This is Phone Number"

     # Delete Python-style comments
     num = re.sub(r'#.*$', "", phone)
-    print "\n"
-    print "-----------Output of re.sub()--------"
-    print "Phone Num : ", num
+    print("\n")
+    print("-----------Output of re.sub()--------")
+    print("Phone Num : ", num)

     # Replace John to Peter in contactInfo
     contactInforevised = re.sub(r'John', "Peter", contactInfo)
-    print "Revised contactINFO : ", contactInforevised
+    print("Revised contactINFO : ", contactInforevised)


 def advanceregex():
     text = "I play on playground. It is the best ground."

     positivelookaheadobjpattern = re.findall(r'play(?=ground)',text,re.M | re.I)
-    print "Positive lookahead: " + str(positivelookaheadobjpattern)
+    print("Positive lookahead: " + str(positivelookaheadobjpattern))
     positivelookaheadobj = re.search(r'play(?=ground)',text,re.M | re.I)
-    print "Positive lookahead character index: "+ str(positivelookaheadobj.span())
+    print("Positive lookahead character index: "+ str(positivelookaheadobj.span()))

     possitivelookbehindobjpattern = re.findall(r'(?<=play)ground',text,re.M | re.I)
-    print "Positive lookbehind: " + str(possitivelookbehindobjpattern)
+    print("Positive lookbehind: " + str(possitivelookbehindobjpattern))
     possitivelookbehindobj = re.search(r'(?<=play)ground',text,re.M | re.I)
-    print "Positive lookbehind character index: " + str(possitivelookbehindobj.span())
+    print("Positive lookbehind character index: " + str(possitivelookbehindobj.span()))

     negativelookaheadobjpattern = re.findall(r'play(?!ground)', text, re.M | re.I)
-    print "Negative lookahead: " + str(negativelookaheadobjpattern)
+    print("Negative lookahead: " + str(negativelookaheadobjpattern))
     negativelookaheadobj = re.search(r'play(?!ground)', text, re.M | re.I)
-    print "Negative lookahead character index: " + str(negativelookaheadobj.span())
+    print("Negative lookahead character index: " + str(negativelookaheadobj.span()))

     negativelookbehindobjpattern = re.findall(r'(?<!play)ground', text, re.M | re.I)
-    print "negative lookbehind: " + str(negativelookbehindobjpattern)
+    print("negative lookbehind: " + str(negativelookbehindobjpattern))
     negativelookbehindobj = re.search(r'(?<!play)ground', text, re.M | re.I)
-    print "Negative lookbehind character index: " + str(negativelookbehindobj.span())
+    print("Negative lookbehind character index: " + str(negativelookbehindobj.span()))

 if __name__ == "__main__":
-    print "\n"
-    print "---------re.match() vs re.search()"
+    print("\n")
+    print("---------re.match() vs re.search()")
     searchvsmatch()
-    print "\n"
+    print("\n")
     basicregex()
-    print "\n"
+    print("\n")
     advanceregex()
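The contrast searchvsmatch() demonstrates: re.match() anchors the pattern at the start of the string, while re.search() scans the whole string. With a sample value for line chosen here (the file defines line outside the hunk shown):

    from __future__ import print_function
    import re

    line = "Cats are smarter than animals"  # hypothetical sample text

    print(re.match(r'animals', line))           # None: not at the start
    print(re.search(r'animals', line).group())  # animals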