-
Notifications
You must be signed in to change notification settings - Fork 0
/
extractor.py
55 lines (47 loc) · 2.38 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from bs4 import BeautifulSoup
def extract_text(soup, label):
    """Return the text that immediately follows a ``<b>`` label.

    Looks up the first ``<b>`` element whose string equals *label* and
    returns its next sibling, stripped of surrounding whitespace, when that
    sibling is a text node.

    Args:
        soup: Parsed document (or any object with a compatible ``find``).
        label: Exact text of the ``<b>`` element, e.g. ``"Status:"``.

    Returns:
        The stripped text after the label, or ``"N/A"`` when the label is
        absent, nothing textual follows it, or *soup* cannot be searched.
    """
    try:
        # ``string=`` replaces the ``text=`` keyword, which Beautiful Soup
        # renamed in 4.4.0 and now warns about.
        b_tag = soup.find('b', string=label)
    except (AttributeError, TypeError) as e:
        # Raised when ``soup`` is None or is not a searchable document.
        print(f"Error extracting text for label '{label}': {e}")
        return "N/A"

    if b_tag is None:
        return "N/A"

    sibling = b_tag.next_sibling
    # Beautiful Soup text nodes subclass ``str``; a missing sibling (None)
    # or an element sibling carries no directly usable text.
    if not isinstance(sibling, str):
        print(f"Error extracting text for label '{label}': no text follows the label")
        return "N/A"
    return sibling.strip()
def extract_anchor_text(soup, label):
    """Return the text of the first ``<a>`` element after a ``<b>`` label.

    Looks up the first ``<b>`` element whose string equals *label*, then
    takes the next ``<a>`` element that follows it in the document.

    Args:
        soup: Parsed document (or any object with a compatible ``find``).
        label: Exact text of the ``<b>`` element, e.g. ``"Operator:"``.

    Returns:
        The stripped anchor text, or ``"N/A"`` when the label is absent, no
        anchor follows it, or *soup* cannot be searched.
    """
    try:
        # ``string=`` replaces the ``text=`` keyword, which Beautiful Soup
        # renamed in 4.4.0 and now warns about.
        b_tag = soup.find('b', string=label)
    except (AttributeError, TypeError) as e:
        # Raised when ``soup`` is None or is not a searchable document.
        print(f"Error extracting anchor text for label '{label}': {e}")
        return "N/A"

    if b_tag is None:
        return "N/A"

    # NOTE(review): find_next() searches the rest of the document, so this
    # can pick up an anchor belonging to a later field — confirm against the
    # page markup.
    anchor = b_tag.find_next('a')
    if anchor is None:
        print(f"Error extracting anchor text for label '{label}': no <a> element follows the label")
        return "N/A"
    return anchor.text.strip()
def extract_satellite_info(page_source):
    """Parse a satellite detail page into a dictionary of its fields.

    Each field is read from a ``<b>Label:</b>`` element: plain-text fields
    via ``extract_text`` and linked fields via ``extract_anchor_text``.
    Footprint images are collected from every
    ``div.footprint_map_picture_container`` that holds an ``<img>`` with a
    ``src`` attribute.

    Args:
        page_source: HTML of the page as a string
            (presumably a satbeams.com satellite page — confirm with caller).

    Returns:
        A dict keyed by field name (the label without its trailing colon),
        with ``"N/A"`` for fields that could not be extracted, plus an
        ``'Image Data'`` list of ``{"url": ..., "alt": ...}`` dicts.
    """
    # Imported here so the function carries its own dependency.
    from urllib.parse import urljoin

    soup = BeautifulSoup(page_source, 'html.parser')

    # (field name, extractor) pairs in output order; the label searched for
    # on the page is the field name followed by a colon.
    fields = (
        ('Satellite Name', extract_text),
        ('Status', extract_text),
        ('Position', extract_text),
        ('NORAD', extract_anchor_text),
        ('Cospar number', extract_anchor_text),
        ('Operator', extract_anchor_text),
        ('Launch date', extract_text),
        ('Launch site', extract_anchor_text),
        ('Launch vehicle', extract_anchor_text),
        ('Launch mass (kg)', extract_text),
        ('Dry mass (kg)', extract_text),
        ('Manufacturer', extract_anchor_text),
        ('Model (bus)', extract_anchor_text),
        ('Orbit', extract_text),
        ('Expected lifetime', extract_text),
    )
    satellite_data = {
        name: extractor(soup, f"{name}:") for name, extractor in fields
    }

    base_url = "https://www.satbeams.com"
    images = []
    for container in soup.find_all('div', class_='footprint_map_picture_container'):
        img = container.find('img')
        if img is None or 'src' not in img.attrs:
            continue
        # urljoin resolves root-relative and relative paths against the site
        # and leaves already-absolute URLs intact, unlike plain concatenation.
        img_url = urljoin(base_url, img['src'])
        img_alt = img.get('alt', 'No Alt Text')
        images.append({"url": img_url, "alt": img_alt})
        print(f"Found image: {img_url} with alt text: {img_alt}")
    satellite_data['Image Data'] = images

    print(f"Extracted data for satellite: {satellite_data.get('Satellite Name', 'Unknown')}")
    return satellite_data