-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
97 lines (80 loc) · 3.41 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from splinter import Browser
from bs4 import BeautifulSoup as bs
import pandas as pd
from time import sleep
from webdriver_manager.chrome import ChromeDriverManager
# Functions that launch the browser and scrape each website
def init_browser():
    """Start a visible (non-headless) Chrome session through splinter.

    The chromedriver location is whatever ``ChromeDriverManager().install()``
    returns; it is handed to splinter as ``executable_path``.
    """
    driver_path = ChromeDriverManager().install()
    return Browser("chrome", executable_path=driver_path, headless=False)
def _scrape_news(browser):
    """Return the title and teaser text of the first article on the Mars news page."""
    browser.visit("https://mars.nasa.gov/news/")
    # Wait up to 2 s for the first article entry to show up before parsing.
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=2)
    soup = bs(browser.html, "html.parser")
    first_article = soup.select_one("ul.item_list li.slide")
    return {
        "news_title": first_article.find("div", class_="content_title").get_text(),
        "news_p": first_article.find("div", class_="article_teaser_body").get_text(),
    }


def _scrape_featured_image(browser):
    """Return the absolute URL of the JPL featured Mars image."""
    browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    sleep(2)
    browser.find_by_id("full_image").click()
    # The "more info" link only exists once the full-image view has opened.
    browser.is_element_present_by_text("more info", wait_time=2)
    browser.links.find_by_partial_text("more info").click()
    soup = bs(browser.html, "html.parser")
    relative_image_url = soup.select_one("figure.lede a img").get("src")
    # The src is prefixed with the site root to build the full URL.
    return f"https://www.jpl.nasa.gov{relative_image_url}"


def _scrape_mars_facts():
    """Return the first table on the Mars facts page as an HTML table string."""
    tables = pd.read_html("https://space-facts.com/mars/", header=0)
    return tables[0].to_html()


def _scrape_hemispheres(browser):
    """Return a list of ``{"title", "url"}`` dicts, one per hemisphere result."""
    browser.visit(
        "https://astrogeology.usgs.gov/search/results"
        "?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    sleep(2)
    soup = bs(browser.html, "html.parser")
    # Each hemisphere result sits in its own <div class="item">.
    num_to_scrape = len(soup.find_all("div", class_="item"))
    print(num_to_scrape)

    hemispheres = []
    for index in range(num_to_scrape):
        # Thumbnails are looked up again on every pass because the page is
        # reloaded by the back-navigation below.
        browser.find_by_css("img.thumb")[index].click()
        hemispheres.append({
            "title": browser.find_by_css("h2.title").text,
            "url": browser.find_by_css("img.wide-image")["src"],
        })
        # Return to the results list; without this the next thumbnail lookup
        # would run against the detail page instead of the results page.
        browser.back()
    return hemispheres


def scrape():
    """Scrape several Mars-related websites and collect the results.

    Returns:
        dict with the keys
        ``news_title`` / ``news_p`` (text of the first listed news article),
        ``featured_image_url`` (absolute image URL),
        ``Mars_facts`` (HTML table string) and
        ``Mars_hemispheres`` (list of ``{"title": ..., "url": ...}`` dicts).

    The browser opened by ``init_browser()`` is always closed, even when one
    of the scraping steps raises.
    """
    browser = init_browser()
    try:
        web_scrape_data = _scrape_news(browser)
        web_scrape_data["featured_image_url"] = _scrape_featured_image(browser)
        web_scrape_data["Mars_facts"] = _scrape_mars_facts()
        web_scrape_data["Mars_hemispheres"] = _scrape_hemispheres(browser)
    finally:
        # Quit in all cases so a failed scrape does not leave Chrome running.
        browser.quit()
    return web_scrape_data
# When executed directly as a script, run the scraper and print what it collected.
if __name__ == "__main__":
    print(scrape())