-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape_mars.py
132 lines (98 loc) · 4.26 KB
/
scrape_mars.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
# coding: utf-8
# Dependencies
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
from splinter import Browser
import time
# Create function to open the browser
def init_browser():
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
return Browser('chrome', **executable_path, headless=False)
# Create a dictionary to hold everything pulled from all the sites
scraped_data = {}
# Create function 'scrape' to pull datas
def scrape():
browser = init_browser()
# Site 1
url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
browser.visit(url)
# Create a Beautiful Soup object
html = browser.html
soup = bs(html, 'html.parser')
# Get the news title and store it in the dictionnary
news_title = soup.find('div', class_='content_title').text
scraped_data['news_title'] = news_title
# Get the news text and store it in the dictionnary
news_p = soup.find('div', class_='article_teaser_body').text
scraped_data['news_p'] = news_p
# Site 2 : Get the featured image and description
url = 'https://www.jpl.nasa.gov/spaceimages/'
browser.visit(url)
# Create a Beautiful Soup object
html = browser.html
soup = bs(html, 'html.parser')
# Get the featured image and the description
results = soup.find('article')['style']
scraped_data['description'] = soup.find('article')['alt']
relative_image_path = (results.split("'"))[1]
url = 'https://www.jpl.nasa.gov'
featured_image_url = url + relative_image_path
# Store the image in the dictionnary
scraped_data['featured_image_url'] = featured_image_url
# Site 3 - Twitter - Get last weather tweet
url = 'https://twitter.com/marswxreport?lang=en'
browser.visit(url)
# Grab the last tweet
html = browser.html
soup = bs(html, 'html.parser')
# Store the text in the dictionnary
scraped_data['mars_weather'] = soup.find('a', class_= "twitter-timeline-link u-hidden").previousSibling
# Site 4 - Get the table
facts_url = 'https://space-facts.com/mars/'
# use pandas to parse the table
facts_df = pd.read_html(facts_url)[1]
# Rename columns
facts_df.columns = ['Description', 'Value']
# Set index on 'Description'
facts_df.set_index('Description', inplace=True)
# Convert the table to html
html_table = facts_df.to_html()
# Replace automatic strings in order to format the table in index.html
html_table = html_table.replace('\n', '').replace('<th></th> <th>Value</th>', '<th>Description</th> <th>Value</th>')
html_table = html_table.replace('<th>Description</th> <th></th>', ' ')
html_table = html_table.replace('<table border="1" class="dataframe"> <thead> <tr style="text-align: right;">', ' ')
# Store the table in the dictionnary
scraped_data['table'] = html_table
# Site 5 -
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)
# grab the links for the hemispheres
html = browser.html
soup = bs(html, 'html.parser')
soup_links = soup.find_all('div', class_="description")
# Create an empty list to store the dictionnaries with 'title' and 'img_url'
hemisphere_image_urls = []
# Create an empty dictionnary to store 'title' and 'img_url'
dict_hemi = {}
# Loop through the four links
for link in soup_links:
url_hemisphere = 'https://astrogeology.usgs.gov' + link.find('a')['href']
# Open the browser for each link
browser.visit(url_hemisphere)
# Let 1sec after opening the new page
time.sleep(1)
html = browser.html
soup = bs(html, 'html.parser')
title = soup.find('h2', class_='title').text
img_url = soup.find('img', class_='wide-image')['src']
dict_hemi["title"]= title
dict_hemi["img_url"]= 'https://astrogeology.usgs.gov' + img_url
hemisphere_image_urls.append(dict_hemi.copy())
# Store the list of dictionnaries in the main dictionnary
scraped_data['hemispheres'] = hemisphere_image_urls
# Close the browser after scraping
browser.quit()
# Return results
return scraped_data