A scalable, modular Python framework for scraping business directories worldwide. Built with Playwright for robust automation and designed for easy expansion to new sites.
📦 Business Directory Scraper
├── 🔧 Core Framework Files
│ ├── scraper_framework.py # Main framework with BaseScraper & configs
│ ├── yellowpages_family_scrapers.py # Template for YellowPages-style sites
│ ├── google_maps_scraper.py # Google Maps specific scraper
│ └── scrape_final.py # Original JustDial scraper (standalone)
│
├── 🧪 Demo & Testing Files
│ ├── final_updated_demo.py # Complete framework demonstration
│ ├── test_updated_selectors.py # Selector validation & testing
│ └── template_demo.py # Template pattern demonstration
│
├── 📊 Data & Utilities
│ ├── score_leads.py # Lead scoring system
│ ├── README.md # This documentation
│ └── *.csv # Output data files
│
└── 🖼️ Debug Files
└── *.png # Debug screenshots
pip install playwright pandas
playwright install chromium
python final_updated_demo.py
# Test YellowPages only
python test_updated_selectors.py
# Test original JustDial scraper
python scrape_final.py
class BaseScraper(ABC):
def build_search_url(self, query: str, location: str) -> str
def extract_business_cards(self, page) -> List
def extract_business_data(self, card, detail_page=None) -> Dict
def scrape(self, query: str, location: str) -> List[Dict]
@dataclass
class SiteConfig:
name: str
base_url: str
search_url_template: str
selectors: Dict[str, str]
pagination_type: str
requires_detail_page: bool = True
rate_limit: float = 0.5
scraper = ScraperFactory.create_scraper('yellowpages_us')
results = scraper.scrape("restaurants", "new york")
Site | Country | Status | Results |
---|---|---|---|
YellowPages.com | 🇺🇸 USA | ✅ Active | 30+ businesses |
Google Maps | 🌍 Global | ✅ Active | 6+ businesses |
JustDial.com | 🇮🇳 India | ✅ Active | 50+ businesses |
Site | Country | Status | Notes |
---|---|---|---|
YellowPages.ca | 🇨🇦 Canada | 🔧 Configured | Needs testing |
YellowPages.com.au | 🇦🇺 Australia | 🔧 Configured | Needs testing |
Yell.com | 🇬🇧 UK | 🔧 Configured | Needs testing |
GelbeSeiten.de | 🇩🇪 Germany | 🔧 Configured | Needs testing |
PagesJaunes.fr | 🇫🇷 France | 🔧 Configured | Needs testing |
If the new site has a similar structure to YellowPages:
# In scraper_framework.py, add to SITE_CONFIGS:
'new_site_name': SiteConfig(
name="NewSite.com (Country)",
base_url="https://www.newsite.com",
search_url_template="https://www.newsite.com/search?q={query}&loc={location}",
selectors={
**YELLOWPAGES_TEMPLATE['selectors'], # Inherit from template
'business_cards': '.listing-item', # Override if different
'name': '.business-title a',
'phone': '.contact-phone'
},
pagination_type='pagination',
requires_detail_page=False,
rate_limit=0.4
)
# In yellowpages_family_scrapers.py:
class NewSiteScraper(BaseYellowPagesScraper):
"""Scraper for NewSite.com"""
def build_search_url(self, query: str, location: str) -> str:
"""Custom URL building if needed"""
return self.config.search_url_template.format(
query=query.replace(' ', '%20'),
location=location.replace(' ', '%20')
)
# Test script
from scraper_framework import SITE_CONFIGS
from yellowpages_family_scrapers import NewSiteScraper
config = SITE_CONFIGS['new_site_name']
scraper = NewSiteScraper(config)
results = scraper.scrape("restaurants", "london")
print(f"Found {len(results)} businesses")
For unique sites that don't follow the YellowPages pattern:
# new_site_scraper.py
from scraper_framework import BaseScraper
class NewSiteScraper(BaseScraper):
def build_search_url(self, query: str, location: str) -> str:
# Custom URL building logic
return f"https://newsite.com/search/{query}/{location}"
def extract_business_cards(self, page) -> list:
# Custom card extraction
page.wait_for_selector('.business-item')
return page.query_selector_all('.business-item')
def extract_business_data(self, card, detail_page=None) -> dict:
# Custom data extraction
name = self.safe_extract(card.query_selector('.title'))
phone = self.safe_extract(card.query_selector('.phone'))
return {
'name': name,
'phone': phone,
'address': '',
'website': '',
'link': ''
}
# In scraper_framework.py, add to ScraperFactory:
elif site_name == 'new_site':
from new_site_scraper import NewSiteScraper
return NewSiteScraper(SITE_CONFIGS[site_name])
- Open the site in Chrome/Edge
- Search for a business type
- Right-click on business cards → "Inspect Element"
- Find CSS selectors for:
- Business card container
- Business name
- Phone number
- Address
- Website link
# Use the debug mode in test_updated_selectors.py
python test_updated_selectors.py
# Choose option 4 for "Debug selectors"
# In scraper_framework.py, update the selectors:
'selectors': {
'business_cards': '.search-result', # Main container
'name': '.business-name a', # Business name link
'phone': '.phone-number', # Phone number
'address': '.business-address', # Full address
'website': '.website-link', # Website URL
'link': '.business-name a' # Detail page link
}
Element | Common Selectors |
---|---|
Business Cards | .listing , .search-result , .business-item , .company-item |
Business Name | .business-name , .company-name , .listing-title , h2 a , h3 a |
Phone | .phone , .telephone , .contact-phone , [href^="tel:"] |
Address | .address , .location , .business-address , .adr |
Website | .website , .url , [href^="http"]:not([href*="yellowpages"]) |
from scraper_framework import multi_site_scrape
# Scrape multiple sites at once
results = multi_site_scrape(
query="italian restaurants",
location="chicago",
sites=['yellowpages_us', 'google_maps']
)
print(f"Total results: {len(results)}")
# Add your new site to the available sites
results = multi_site_scrape(
query="coffee shops",
location="seattle",
sites=['yellowpages_us', 'google_maps', 'new_site_name']
)
All scrapers return a standardized dictionary format:
{
'name': 'Business Name',
'phone': '(555) 123-4567',
'address': '123 Main St, City, State 12345',
'full_address': '123 Main St, City, State 12345',
'website': 'https://business-website.com',
'link': 'https://directory-site.com/business-page',
'rating': '4.5',
'rating_count': '(123)',
'query': 'restaurants',
'location': 'new york',
'source': 'YellowPages.com (USA)'
}
# Test if selectors find elements
python test_updated_selectors.py
# Check data completeness
results = scraper.scrape("test query", "test location")
phone_coverage = sum(1 for r in results if r.get('phone')) / len(results)
print(f"Phone coverage: {phone_coverage:.1%}")
# Enable debug mode for screenshots and detailed logging
scraper = NewSiteScraper(config, debug=True)
# Solution: Test different selectors
selectors_to_try = ['.listing', '.result', '.business', '.company']
for selector in selectors_to_try:
elements = page.query_selector_all(selector)
print(f"{selector}: {len(elements)} found")
# Solution: Check if name selector is working
name_elem = card.query_selector('.business-name')
if not name_elem:
# Try different name selectors
name_elem = card.query_selector('h2 a') or card.query_selector('h3 a')
# Solution: Some sites require clicking to reveal contact info
# Set requires_detail_page=True in SiteConfig
- Use browser developer tools to find correct selectors
- Enable debug mode for screenshots and logging
- Test with simple queries first (e.g., "restaurants")
- Check rate limiting if getting blocked
- Verify user agent is set properly
# Adjust rate limiting per site
SiteConfig(
rate_limit=0.5, # 500ms between requests
# More aggressive sites may need 1.0-2.0 seconds
)
# Set pagination type based on site behavior
pagination_type='infinite_scroll' # For sites that load on scroll
pagination_type='pagination' # For sites with page numbers
pagination_type='load_more' # For sites with "Load More" buttons
The framework includes a lead scoring system:
# Score leads after scraping
from score_leads import score_business_leads
scored_results = score_business_leads(results)
- Follow the template pattern above
- Test thoroughly with different queries
- Update this README with new site info
- Submit with sample results
When reporting issues, include:
- Site name and URL
- Query and location used
- Error messages
- Debug screenshots (if available)
Current Framework Performance:
- ✅ YellowPages: 30+ businesses, 83% phone coverage
- ✅ Google Maps: 6+ businesses, 100% rating coverage
- ✅ JustDial: 50+ businesses, dynamic selector system
- ✅ Template System: 5+ international sites ready
- ✅ Data Quality: 83% phone, 64% website coverage
Ready for production use! 🚀