diff --git a/InSpy.py b/InSpy.py index 5bb49fd..ff40d9f 100755 --- a/InSpy.py +++ b/InSpy.py @@ -1,24 +1,24 @@ #!/usr/bin/env python2 -# Copyright (c) 2016 Jonathan Broche (@g0jhonny) +# Copyright (c) 2018 Jonathan Broche (@LeapSecurity) -from lib.logger import * -from lib.soupify import * +import argparse, sys, os +from lib.http import * from lib.workbench import * -from lib.crawler import * -import os, argparse, sys, time +from lib.soup import * +from lib.export import * +from lib.logger import * -parser = argparse.ArgumentParser(description='InSpy - A LinkedIn enumeration tool by Jonathan Broche (@g0jhonny)', version="2.0.2") -parser.add_argument('company', help="Company name to use for tasks.") -techgroup = parser.add_argument_group(title="Technology Search") -techgroup.add_argument('--techspy', metavar='file', const="wordlists/tech-list-small.txt", nargs='?', help="Crawl LinkedIn job listings for technologies used by the company. Technologies imported from a new line delimited file. [Default: tech-list-small.txt]") -techgroup.add_argument('--limit', metavar='int', type=int, default=50, help="Limit the number of job listings to crawl. [Default: 50]") -empgroup = parser.add_argument_group(title="Employee Harvesting") -empgroup.add_argument('--empspy', metavar='file', const="wordlists/title-list-small.txt", nargs='?', help="Discover employees by title and/or department. Titles and departments are imported from a new line delimited file. [Default: title-list-small.txt]") -empgroup.add_argument('--emailformat', metavar='string', help="Create email addresses for discovered employees using a known format. [Accepted Formats: first.last@xyz.com, last.first@xyz.com, firstl@xyz.com, lfirst@xyz.com, flast@xyz.com, lastf@xyz.com, first@xyz.com, last@xyz.com]") + +parser = argparse.ArgumentParser(description='InSpy - A LinkedIn enumeration tool by Jonathan Broche (@LeapSecurity)', version="3.0.0") +parser.add_argument('company', help="Company name to use for tasks.") +parser.add_argument('--domain', help="Company domain to use for searching.") +parser.add_argument('--email', help="Email format to create email addresses with. [Accepted Formats: first.last@xyz.com, last.first@xyz.com, firstl@xyz.com, lfirst@xyz.com, flast@xyz.com, lastf@xyz.com, first@xyz.com, last@xyz.com]") +parser.add_argument('--titles', metavar='file', default="wordlists/title-list-small.txt", nargs='?', help="Discover employees by title and/or department. Titles and departments are imported from a new line delimited file. [Default: title-list-small.txt]") outgroup = parser.add_argument_group(title="Output Options") outgroup.add_argument('--html', metavar='file', help="Print results in HTML file.") outgroup.add_argument('--csv', metavar='file', help="Print results in CSV format.") outgroup.add_argument('--json', metavar='file', help="Print results in JSON.") +outgroup.add_argument('--xml', metavar='file', help="Print results in XML.") if len(sys.argv) == 1: parser.print_help() @@ -26,100 +26,57 @@ args = parser.parse_args() start_logger(args.company) +hunterapi = "" #insert hunterio api key here -print "\nInSpy {}\n".format(parser.version) - -if not args.techspy and not args.empspy: - print "You didn't provide any work for me to do." 
- sys.exit(1) - -stime = time.time() -tech_html, employee_html, tech_csv, employee_csv, tech_json, employee_json = [], [], [], [], [], [] - -if args.techspy: - if os.path.exists(os.path.abspath(args.techspy)): - initial_crawl = crawl_jobs(args.company) - if initial_crawl: - soup = soupify(initial_crawl) - job_links = [] - for link in get_job_links(soup, args.company): - if len(job_links) < args.limit: - job_links.append(link) - if len(job_links) != args.limit: - page_links = get_page_links(soup) - for page in range(len(page_links)): - if len(job_links) == args.limit: break - urlcrawl = crawl_url(page_links[page]) - if urlcrawl: - for link in get_job_links(soupify(urlcrawl), args.company): - if len(job_links) < args.limit: - job_links.append(link) +print "\nInSpy {}".format(parser.version) - pstatus("{} Jobs identified".format(len(job_links))) - if job_links: - techs = {} - for job in range(len(job_links)): - jobresponse = crawl_url(job_links[job]) - if jobresponse: - jobsoup = soupify(jobresponse) - description = get_job_description(jobsoup) - matches = identify_tech(description, os.path.abspath(args.techspy)) - if matches: - title = get_job_title(jobsoup) - techs[title] = {job_links[job]:matches} +if args.domain and not args.email: #search hunterio for email format + domain = args.domain + email = get_email_format(args.domain, hunterapi).replace("{", "").replace("}","") +elif args.email and not args.domain: #search clearbit for domain + email = args.email + domain = get_domain(args.company) +else: #no domain or email provided - fully automate it + domain = get_domain(args.company) + if domain: + email = get_email_format(domain, hunterapi) + if email: email = email.replace("{", "").replace("}","") - tech_html, tech_csv, tech_json = craft_tech(techs) - else: - perror("No such file or directory: '{}'".format(args.techspy)) +if domain and email: + print "\nDomain: {}, Email Format: {}\n".format(domain, email) + employees = {} -if args.empspy: - if os.path.exists(os.path.abspath(args.empspy)): - employees = {} - emails = [] - for response in crawl_employees(args.company, os.path.abspath(args.empspy)): - for name, title in get_employees(soupify(response)).items(): - if args.company.lower() in title.lower(): - if not name in employees: - employees[name] = title + if os.path.exists(os.path.abspath(args.titles)): + for response in search_linkedin(args.company, os.path.abspath(args.titles)): + for name, title in get_employees(soupify(response)).items(): + if args.company.lower() in title.lower(): + if not name in employees: + employees[name] = title + print "\n{} Employees identified".format(len(employees.keys())) + else: + print os.path.abspath(args.titles) + print "No such file or directory: '{}'".format(args.titles) - pstatus("{} Employees identified".format(len(employees.keys()))) - if employees: - if args.emailformat: - if args.emailformat[:args.emailformat.find('@')] in ['first.last', 'last.first', 'firstlast', 'lastfirst', 'first_last', 'last_first', 'first', 'last', 'firstl', 'lfirst', 'flast', 'lastf']: - employee_html, employee_csv, employee_json = craft_employees(employees, args.emailformat) - else: - pwarning("You didn't provide a valid e-mail format. 
See help (-h) for acceptable formats.") - employee_html, employee_csv, employee_json = craft_employees(employees, None) - else: - employee_html, employee_csv, employee_json = craft_employees(employees, None) - else: - print os.path.abspath(args.empspy) - perror("No such file or directory: '{}'".format(args.empspy)) + if employees: + #output employees + for name, title in employees.iteritems(): + print "{} {}".format(name, title[:50].replace('&', '&')) + + #craft emails + emails = create_emails(employees, domain, email) -#output -if args.html: - if tech_html or employee_html: - if tech_html and employee_html: - craft_html(args.company, tech_html, employee_html, args.html) - elif tech_html and not employee_html: - craft_html(args.company, tech_html, None, args.html) - else: - craft_html(args.company, None, employee_html, args.html) -if args.csv: - if tech_csv or employee_csv: - if tech_csv and employee_csv: - craft_csv(tech_csv, employee_csv, args.csv) - elif tech_csv and not employee_csv: - craft_csv(tech_csv, None, args.csv) - else: - craft_csv(None, employee_csv, args.csv) -if args.json: - if tech_json or employee_json: - if tech_json and employee_json: - craft_json(tech_json, employee_json, args.json) - elif tech_json and not employee_json: - craft_json(tech_json, None, args.json) - else: - craft_json(None, employee_json, args.json) + if emails: + #output emails + print "\nEmails crafted\n".format(len(emails.keys())) + for name, email in emails.items(): + print email -print "Completed in {:.1f}s".format(time.time()-stime) \ No newline at end of file + #export results + if args.html: + output("html", args.html, args.company, domain, employees, emails) + if args.xml: + output("xml", args.xml, args.company, domain, employees, emails) + if args.json: + output("json", args.json, args.company, domain, employees, emails) + if args.csv: + output("csv", args.csv, args.company, domain, employees, emails) \ No newline at end of file diff --git a/LICENSE b/LICENSE index 3e53492..d293ab9 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2016 Jonathan Broche +Copyright (c) 2018 Leap Security Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 0d4ec11..0d92ae7 100644 --- a/README.md +++ b/README.md @@ -3,50 +3,41 @@ ## Introduction ----- -InSpy is a python based LinkedIn enumeration tool. Inspy has two functionalities: TechSpy and EmpSpy. +InSpy is a python based LinkedIn enumeration tool. -- TechSpy - Crawls LinkedIn job listings for technologies used by the provided company. InSpy attempts to identify technologies by matching job descriptions to keywords from a new line delimited file. - -- EmpSpy - Crawls LinkedIn for employees working at the provided company. InSpy searches for employees by title and/or departments from a new line delimited file. InSpy may also create emails for the identified employees if the user specifies an email format. +Version 3.0 introduces the automation of domain and email retrieval in addition to randomized headers and xml output support. ## Installation ----- Run `pip install -r requirements.txt` within the cloned InSpy directory. +Obtain an API key from [HunterIO](https://hunter.io/) and insert it into the hunterio variable within InSpy.py (line 29). 
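+
+For reference, a minimal sketch of the Hunter.io lookup InSpy performs once the key is in place (the domain and key below are placeholders; InSpy's own request is built in `get_email_format` in lib/workbench.py):
+
+```python
+import requests
+
+HUNTER_API_KEY = "YOUR_HUNTERIO_KEY"  # placeholder, substitute your own key
+domain = "example.com"                # placeholder target domain
+
+url = "https://api.hunter.io/v2/domain-search?domain={domain}&api_key={api}".format(
+    domain=domain, api=HUNTER_API_KEY)
+data = requests.get(url, timeout=3).json()
+
+# InSpy reads the "pattern" field (e.g. "{first}.{last}") to build addresses.
+print(data["data"]["pattern"])
+```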
+ ## Help ----- ``` -InSpy - A LinkedIn enumeration tool by Jonathan Broche (@jonathanbroche) +InSpy - A LinkedIn enumeration tool by Jonathan Broche (@LeapSecurity) positional arguments: - company Company name to use for tasks. + company Company name to use for tasks. optional arguments: - -h, --help show this help message and exit - -v, --version show program's version number and exit - -Technology Search: - --techspy [file] Crawl LinkedIn job listings for technologies used by - the company. Technologies imported from a new line - delimited file. [Default: tech-list-small.txt] - --limit int Limit the number of job listings to crawl. [Default: - 50] - -Employee Harvesting: - --empspy [file] Discover employees by title and/or department. Titles - and departments are imported from a new line delimited - file. [Default: title-list-small.txt] - --emailformat string Create email addresses for discovered employees using - a known format. [Accepted Formats: first.last@xyz.com, - last.first@xyz.com, first_last@xyz.com, last_first@xyz.com, - firstl@xyz.com, lfirst@xyz.com, - flast@xyz.com, lastf@xyz.com, first@xyz.com, - last@xyz.com] + -h, --help show this help message and exit + -v, --version show program's version number and exit + --domain DOMAIN Company domain to use for searching. + --email EMAIL Email format to create email addresses with. [Accepted + Formats: first.last@xyz.com, last.first@xyz.com, + firstl@xyz.com, lfirst@xyz.com, flast@xyz.com, + lastf@xyz.com, first@xyz.com, last@xyz.com] + --titles [file] Discover employees by title and/or department. Titles and + departments are imported from a new line delimited file. + [Default: title-list-small.txt] Output Options: - --html file Print results in HTML file. - --csv file Print results in CSV format. - --json file Print results in JSON. + --html file Print results in HTML file. + --csv file Print results in CSV format. + --json file Print results in JSON. + --xml file Print results in XML. ``` diff --git a/lib/__init__.pyc b/lib/__init__.pyc index e42f2c8..2b69e10 100644 Binary files a/lib/__init__.pyc and b/lib/__init__.pyc differ diff --git a/lib/crawler.py b/lib/crawler.py deleted file mode 100644 index 9a2297c..0000000 --- a/lib/crawler.py +++ /dev/null @@ -1,51 +0,0 @@ -from logger import * -import requests -requests.packages.urllib3.disable_warnings() - -headers={'Host':'www.linkedin.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'} - - -def crawl_employees(company, file): - titles = [] - responses = [] - try: - with open(file) as f: - for title in f.readlines(): - titles.append(title.rstrip()) - for title in titles: - response = requests.get("https://www.linkedin.com/title/{}-at-{}".format(title.replace(' ', '-'), company.replace(' ', '-')), timeout=3, headers=headers) - responses.append(response.text) - except requests.exceptions.Timeout as e: - pwarning("Warning: Timed out crawling {}".format(title)) - except Exception as e: - perror("Error: {}".format(e)) - logging.error(e) - return responses - -def crawl_jobs(company): #initial crawl - url = "https://www.linkedin.com/jobs/{}-jobs".format(company.replace(' ', '-')) - try: - response = requests.get(url, timeout=3, headers=headers) - return response.text - except requests.exceptions.Timeout as e: - perror("Error: Timed out. 
Try again, LinkedIn doesn't like us sometimes") - logging.error(e) - except requests.exceptions.ReadTimeout as e: - perror("Error: Read time out") - logging.error(e) - except Exception as e: - perror("Error: {}".format(e)) - logging.error(e) - - -def crawl_url(url=None): #page crawls - try: - response = requests.get(url, timeout=3, headers=headers) - return response.text - except requests.exceptions.Timeout as e: - pwarning("Warning: Timed out") - except requests.exceptions.ReadTimeout as e: - pwarning("Warning: Read time out") - except Exception as e: - pwarning("Warning: {}".format(e)) - logging.error(e) \ No newline at end of file diff --git a/lib/crawler.pyc b/lib/crawler.pyc deleted file mode 100644 index fd4bce3..0000000 Binary files a/lib/crawler.pyc and /dev/null differ diff --git a/lib/export.py b/lib/export.py new file mode 100644 index 0000000..3bd7055 --- /dev/null +++ b/lib/export.py @@ -0,0 +1,99 @@ +import json, os, xml.dom.minidom, time +from xml.etree.ElementTree import Element, SubElement, tostring + +def output(format, file, company, domain, employees, emails): + if format == "xml": + oxml(file, company, domain, employees, emails) + if format == "csv": + ocsv(file, company, domain, employees, emails) + if format == "html": + ohtml(file, company, domain, employees, emails) + if format == "json": + ojson(file, company, domain, employees, emails) + +#CSV +def ocsv(filename, company, domain, employees, emails): + with open(os.path.abspath(filename), 'a') as csvfile: + fieldnames = ["Employee Name", "Title", "Email"] + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + for name, title in employees.iteritems(): + writer.writerow({"Employee Name": name, "Title": title.replace('&', '&'), "Email": emails[name]}) + +#JSON +def ojson(file, company, domain, employees, emails): + employee_json = [] + + for name, title in employees.iteritems(): + employee_json.append({"name": name, "title": title.replace('&', '&'), "email": emails[name]}) + + full_json = { + "company": {"name":company, "domain": domain}, + "employees": employee_json + } + + with open(os.path.abspath(file), 'w') as f: + f.write(json.dumps(full_json)) + +#XML +def oxml(file, company, domain, employees, emails): + top = Element('InSpy') + cxml = SubElement(top, 'Company') + + #company name + cnxml = SubElement(cxml, "Name") + cnxml.text = company + #company domain + cdxml = SubElement(cxml, "Domain") + cdxml.text = domain + + echild = SubElement(top, 'Employees') + + for name, title in employees.iteritems(): + + employee = SubElement(echild, "Employee") + #name + nxml = SubElement(employee, "Name") + nxml.text = name + #title + txml = SubElement(employee, "Title") + txml.text = title.replace("&", "&") + #email + exml = SubElement(employee, "Email") + exml.text = emails[name] + + fxml = xml.dom.minidom.parseString(tostring(top)) + + with open(os.path.abspath(file), 'w') as f: + f.write(fxml.toprettyxml()) + +#HTML +def ohtml(file, company, domain, employees, emails): + employee_html = [] + + for name, title in employees.iteritems(): + employee_html.append("{name}{title}{email}".format(name=name, title=title, email=emails[name])) + + page = """ + + InSpy - {company} + + + +

+    <body>
+    <h1>InSpy</h1>
+    <p>Company: {company}<br/>
+    Date: {time}</p>
+    <table>
+    <tr><th>Employee Name</th><th>Title</th><th>E-mail</th></tr>
+    {html}
+    </table>
+    </body>
+ + + """.format(company=company, time=time.strftime("%Y/%m/%d %H:%M:%S"), html=employee_html) + + with open(os.path.abspath(file), 'w') as f: + f.write(page) \ No newline at end of file diff --git a/lib/export.pyc b/lib/export.pyc new file mode 100644 index 0000000..f62f85e Binary files /dev/null and b/lib/export.pyc differ diff --git a/lib/http.py b/lib/http.py new file mode 100644 index 0000000..b949f0a --- /dev/null +++ b/lib/http.py @@ -0,0 +1,33 @@ + +import requests, random +from logger import * +#requests.packages.urllib3.disable_warnings() + +def random_header(): + + agents = ['Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:25.0) Gecko/20100101 Firefox/25.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_8) AppleWebKit/537.13+ (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'] + + return {'User-Agent': random.choice(agents),'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'} + +def http_request(url): + + try: + r = requests.get(url, timeout=3, headers=random_header()) + + if r.status_code == 200: + if "linkedin.com" in url: + return {"status": r.status_code, "response": r.text} + else: + return {"status": r.status_code, "response": r.json()} + else: + return {"status": r.status_code, "response": ""} + + except requests.exceptions.Timeout as e: + print "Error: Timed out." + logging.error(e) + except Exception as e: + logging.error(e) \ No newline at end of file diff --git a/lib/http.pyc b/lib/http.pyc new file mode 100644 index 0000000..9e537f3 Binary files /dev/null and b/lib/http.pyc differ diff --git a/lib/logger.py b/lib/logger.py index 76998a5..aa8714e 100644 --- a/lib/logger.py +++ b/lib/logger.py @@ -10,23 +10,4 @@ def start_logger(company): logger.propagate = False logger.addHandler(handler) logger.setLevel(logging.INFO) - logging.getLogger("requests").setLevel(logging.DEBUG) - -class colors(object): - grey = "\033[0;37m" - cyan = "\033[0;36m" - yellow = "\033[0;33m" - red = "\033[1;31m" - normal = "\033[0;00m" - -def pstatus(message): - print "{} {}{}{}".format(time_format, colors.grey, message, colors.normal) - -def presults(message): - print "{} {}{}{}".format(time_format, colors.cyan, message, colors.normal) - -def pwarning(message): - print "{} {}{}{}".format(time_format, colors.yellow, message, colors.normal) - -def perror(message): - print "{} {}{}{}".format(time_format, colors.red, message, colors.normal) \ No newline at end of file + logging.getLogger("requests").setLevel(logging.DEBUG) \ No newline at end of file diff --git a/lib/logger.pyc b/lib/logger.pyc index 01bee32..e0ec5c5 100644 Binary files a/lib/logger.pyc and b/lib/logger.pyc differ diff --git a/lib/soup.py b/lib/soup.py new file mode 100644 index 0000000..1351719 --- /dev/null +++ b/lib/soup.py @@ -0,0 +1,25 @@ +import BeautifulSoup, json + +def soupify(response): + try: + soupd = BeautifulSoup.BeautifulSoup(response) + return soupd + except (AttributeError, TypeError) as e: + pass + except Exception as e: + print "Error: {}".format(e) + +def get_employees(soup): + try: + employees = {} + for n, t in zip(soup.findAll('a', {"class": "professional__name"}), soup.findAll("p", {"class" : 
"professional__headline"})): + name = n.getText().encode('ascii','ignore') + title = t.getText().encode('ascii','ignore') + if name and title: + employees[name] = title + return employees + except (AttributeError, TypeError) as e: + pass + except Exception as e: + print "Error: {}".format(e) + diff --git a/lib/soup.pyc b/lib/soup.pyc new file mode 100644 index 0000000..305db4d Binary files /dev/null and b/lib/soup.pyc differ diff --git a/lib/soupify.py b/lib/soupify.py deleted file mode 100644 index 4513657..0000000 --- a/lib/soupify.py +++ /dev/null @@ -1,71 +0,0 @@ -from logger import * -import BeautifulSoup, json - -def soupify(response): - try: - soupd = BeautifulSoup.BeautifulSoup(response) - return soupd - except (AttributeError, TypeError) as e: - pass - except Exception as e: - perror("Error: {}".format(e)) - logging.error("Soupify.py Error: {}".format(e)) - -def get_employees(soup): - try: - employees = {} - for n, t in zip(soup.findAll('a', {"class": "professional__name"}), soup.findAll("p", {"class" : "professional__headline"})): - name = u''.join(n.getText()).encode('utf-8') - title = u''.join(t.getText()).encode('utf-8') - if name and title: - employees[name] = title - return employees - except (AttributeError, TypeError) as e: - pass - except Exception as e: - perror("Error: {}".format(e)) - logging.error("Soupify.py Error: {}".format(e)) - -def get_job_links(soup, company): - try: - job_links = [] - for link, comp in zip(soup.findAll('a', { "class" : "job-title" }), soup.findAll('span', { "class" : "company-name-text" })): - if comp.text == company: - job_links.append(u''.join(link['href']).encode('utf-8')) - return job_links - except (AttributeError, TypeError) as e: - pass - except Exception as e: - perror("Error: {}".format(e)) - logging.error("Soupify.py Error: {}".format(e)) - -def get_page_links(soup): - page_links = [] - try: - for page in soup.findAll('li', { "class" : "page-number"}): - a = page.findAll('a') - page_links.append(u''.join("https://linkedin.com{}".format(a[0]['href'])).encode('utf-8')) - return page_links - except (AttributeError, TypeError) as e: - pass - except Exception as e: - perror("Error: {}".format(e)) - logging.error("Soupify.py Error: {}".format(e)) - -def get_job_title(soup): - try: - return u''.join(json.loads(soup.find('code', {"id" : "decoratedJobPostingModule"}).string)['decoratedJobPosting']['jobPosting'].get('title')).encode('utf-8') - except (AttributeError, TypeError) as e: - pass - except Exception as e: - perror("Error: {}".format(e)) - logging.error("Soupify.py Error: {}".format(e)) - -def get_job_description(soup): - try: - return u''.join(json.loads(soup.find('code', {"id" : "decoratedJobPostingModule"}).string)['decoratedJobPosting']['jobPosting']['description'].get('rawText')).encode('utf-8') - except (AttributeError, TypeError): - pass - except Exception as e: - perror("Error: {}".format(e)) - logging.error("Soupify.py Error: {}".format(e)) \ No newline at end of file diff --git a/lib/soupify.pyc b/lib/soupify.pyc deleted file mode 100644 index eb4787e..0000000 Binary files a/lib/soupify.pyc and /dev/null differ diff --git a/lib/workbench.py b/lib/workbench.py index 56dba6b..2c88d28 100644 --- a/lib/workbench.py +++ b/lib/workbench.py @@ -1,93 +1,101 @@ -import re, json, os, csv, time, codecs, HTMLParser +import re, json, HTMLParser, unicodedata +from http import * from logger import * -def identify_tech(data, file): - matches = [] + +def get_domain(company): #Clearbit API - clearbit.com + + clearbit_request = 
"https://autocomplete.clearbit.com/v1/companies/suggest?query={}".format(company) + clearbit_results = [] + domain = "" + + r = http_request(clearbit_request) + + if len(r["response"]) >=1: + for element in r["response"]: + if company.lower() == element['name'].lower(): + clearbit_results.append({"name" : element['name'], "domain":element['domain']}) + + if len(clearbit_results) == 1: #return domain if one result + domain = clearbit_results[0]["domain"] + elif len(clearbit_results) > 1: #prompt user if multiple domains identified + print "Multiple domains identified for company. Which one is the target?" + for index, result in enumerate(clearbit_results): + print "{}) Name: {}, Domain: {}".format(index, result["name"], result["domain"]) + choice = input() + domain = clearbit_results[choice]["domain"] + + if domain: + return domain + else: + logging.error("Clearbit API - HTTP {} Error".format(r["status"])) + print "InSpy could not identify the domain name. Use --domain." + + +def get_email_format(domain, apikey): #HunterIO API - hunter.io + + hunter_request = "https://api.hunter.io/v2/domain-search?domain={domain}&api_key={api}".format(domain=domain, api=apikey) + emailformat = "" + + r = http_request(hunter_request) + + if r["status"] == 200: + for k,v in r["response"].iteritems(): + if k == 'data': + if v['pattern']: + emailformat = v['pattern'] + logging.info("HunterIO Returned Email Format: {}".format(emailformat)) + else: + logging.error("HunterIO - HTTP {} Error".format(r["status"])) + + if emailformat: + return emailformat + else: + print "InSpy could not identify the email format. Use --email." + +def search_linkedin(company, file): + titles = [] + responses = [] + with open(file) as f: - keywords = f.readlines() - - for sentence in data.lower().split("."): - keyword_found = [] - for keyword in keywords: - if re.findall('\\b{}\\b'.format(re.escape(keyword.rstrip())), re.escape(sentence)): - keyword_found.append(keyword.rstrip()) - if keyword_found: - matches.append({sentence:keyword_found}) - return matches - -def craft_tech(matches): - logging.info(matches) - unique_techs, html_out, csv_out, json_out = [], [], [], [] - for title, link in matches.items(): - techs_per_job = [] - for url in link.keys(): - for data in link.get(url): - for sentence, techs in data.items(): - highlight_techs = sentence - for tech in techs: - if tech not in unique_techs: unique_techs.append(tech) - if tech not in techs_per_job: techs_per_job.append(tech) - highlight_techs = re.sub('\\b{}\\b'.format(tech), '{}'.format(tech), highlight_techs) - html_out.append("{title}{techs}{sentence}".format(title=title,techs=', '.join(techs),sentence=highlight_techs.replace("\xe2\x80\xa2", " * "),url=url)) - csv_out.append({"Job Title": title, "Technologies": ', '.join(techs), "Excerpt": sentence, "URL": url}) - json_out.append({"jobtitle": title, "technologies": ', '.join(techs), "excerpt": sentence, "url": url}) - - pstatus('Title: {}'.format(title)) - presults(', '.join(techs_per_job)) - - if unique_techs: - pstatus("Unique Technologies:") - presults(', '.join(unique_techs)) - - return html_out, csv_out, json_out - -def craft_employees(employees, eformat): - hparser=HTMLParser.HTMLParser() - html_out, csv_out, json_out = [], [], [] - emails = {} - if eformat: - format = eformat[:eformat.find('@')] - domain = eformat[eformat.find('@'):] - - for name in employees.keys(): - try: - first = hparser.unescape([n.split() for n in name.split(',',1)][0][0]) - last = hparser.unescape([n.split() for n in name.split(',',1)][0][-1]) 
- except UnicodeDecodeError: - first = [n.split() for n in name.split(',',1)][0][0] - last = [n.split() for n in name.split(',',1)][0][-1] - email = "{}{}".format(format_email(format, first.lower(), last.lower()), domain) - if email: - emails[name] = email - - for name, title in employees.items(): - try: - name = hparser.unescape(name) - title = hparser.unescape(title) - except UnicodeDecodeError: + for title in f.readlines(): + titles.append(title.rstrip()) + + for title in titles: + response = http_request("https://www.linkedin.com/title/{}-at-{}".format(title.replace(' ', '-'), company.replace(' ', '-'))) + if response["status"] == 200: + responses.append(response["response"]) + elif response["status"] == 999: #LinkedIn doesn't like InSpy + logging.error("LinkedIn Search - HTTP 999 Error Crawling {}".format(title)) pass - presults("{} {}".format(name, title[:50].replace('&', '&'))) - logging.info("Employees identified: {}".format(employees)) - - #html output - if emails: - html_out.append("{name}{title}{email}".format(name=name, title=title, email=emails.get(name))) - csv_out.append({"Employee Name": name, "Title": title, "Email": emails.get(name)}) - json_out.append({"employeename": name, "title": title, "email": emails.get(name)}) else: - html_out.append("{name}{title}--".format(name=name, title=title)) - csv_out.append({"Employee Name": name, "Title": title, "Email": "--"}) - json_out.append({"employeename": name, "title": title, "email": "--"}) + logging.error("LinkedIn Search - HTTP {} Error Crawling {}".format(response["status"], title)) + pass + return responses - if emails: - pstatus("Emails crafted") - for name, email in emails.items(): - presults(email) +#craft emails +def create_emails(employees, domain, eformat): + hparser=HTMLParser.HTMLParser() + emails = {} - - return html_out, csv_out, json_out + for name in employees.keys(): #split up employee name by first, last name + try: + first = hparser.unescape([n.split() for n in name.split(',',1)][0][0]) + last = hparser.unescape([n.split() for n in name.split(',',1)][0][-1]) + except UnicodeDecodeError: + first = [n.split() for n in name.split(',',1)][0][0] + last = [n.split() for n in name.split(',',1)][0][-1] + + #create emails + email = "{}@{}".format(format_email(eformat, first.lower(), last.lower()), domain) + + if email: + emails[name] = email + if emails: + return emails + def format_email(format, first, last): try: formats = { @@ -106,88 +114,4 @@ def format_email(format, first, last): } return formats[format] except Exception as e: - logging.error(e) - - -def craft_html(company, tech_html, employee_html, filename): - if tech_html: - tech_table = """ -

-        <h2>Technologies Identified</h2>
-        <table>
-        <tr><th>Job Title</th><th>Technologies</th><th>Excerpt</th></tr>
-        {techs}
-        </table>
-        """.format(techs=' '.join(tech_html))
-    else: tech_table = ""
-
-    if employee_html:
-        employee_table = """
-        <h2>Employees Identified</h2>
-        <table>
-        <tr><th>Employee Name</th><th>Title</th><th>E-mail</th></tr>
-        {html}
-        </table>
-        """.format(html=' '.join(employee_html))
-    else: employee_table = ""
-
-    page = """
-    <html>
-    <head><title>InSpy - {company}</title></head>
-    <body>
-    <h1>InSpy</h1>
-    <p>Company: {company}<br/>
-    Date: {time}</p>
-    {tech}
-    {emp}
-    </body>
- - - """.format(company=company, time=time.strftime("%Y/%m/%d %H:%M:%S"), tech=tech_table, emp=employee_table) - - with open(os.path.abspath(filename), 'w') as f: - f.write(page) - -def craft_csv(tech_csv, employee_csv, filename): - - if tech_csv: - with open(os.path.abspath(filename), 'w') as csvfile: - fieldnames = ["Job Title", "Technologies", "Excerpt", "URL"] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - for row in tech_csv: - writer.writerow(row) - writer.writerow({}) - - if employee_csv: - with open(os.path.abspath(filename), 'a') as csvfile: - fieldnames = ["Employee Name", "Title", "Email"] - writer = csv.DictWriter(csvfile, fieldnames=fieldnames) - writer.writeheader() - for row in employee_csv: - writer.writerow(row) - -def craft_json(tech_json, employee_json, filename): - if tech_json and employee_json: - tech = {"technologies":tech_json} - emp = {"employees":employee_json} - full_json = tech.copy() - full_json.update(emp) - elif tech_json: - tech = {"technologies":tech_json} - full_json = tech - elif employee_json: - emp = {"employees":employee_json} - full_json = emp - - with open(os.path.abspath(filename), 'w') as f: - f.write(json.dumps(full_json)) + print e \ No newline at end of file diff --git a/lib/workbench.pyc b/lib/workbench.pyc index f4ef9e6..c8fb15d 100644 Binary files a/lib/workbench.pyc and b/lib/workbench.pyc differ diff --git a/requirements.txt b/requirements.txt index bdc7597..8633c01 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ -requests == 2.7.0 -BeautifulSoup == 3.2.1 +requests==2.7.0 + +BeautifulSoup==3.2.1 diff --git a/wordlists/tech-list-large.txt b/wordlists/tech-list-large.txt deleted file mode 100644 index 5c482e8..0000000 --- a/wordlists/tech-list-large.txt +++ /dev/null @@ -1,55 +0,0 @@ -c# -ruby -python -cms -azure -java -javascript -cisco -asa -meraki -apache -iis -sql -mysql -windows -linux -unix -apple -adobe -android -blackberry -broadband -cloud -computing -dropbox -ebay -exchange -postfix -sendmail -encryption -filesharing -microsoft -mobile -oracle -juniper -avaya -software -sunos -as400 -mainframe -bluecoat -siem -intrusion prevention -intrusion detection -ids -ips -web proxy -web filter -antivirus -anti virus -dlp -endpoint detection -mobile security -active directory -vmware \ No newline at end of file diff --git a/wordlists/tech-list-small.txt b/wordlists/tech-list-small.txt deleted file mode 100644 index ac13c8e..0000000 --- a/wordlists/tech-list-small.txt +++ /dev/null @@ -1,10 +0,0 @@ -c# -ruby -python -windows -unix -linux -antivirus -ips -ids -cisco \ No newline at end of file
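As a worked illustration of the accepted --email formats, the sketch below shows one plausible mapping from a harvested name to an address; it is not the project's format_email implementation, and the example name and domain are made up.

```python
# Illustrative only: one way the documented formats (first.last@xyz.com,
# flast@xyz.com, and so on) can be expanded into addresses.
def build_email(fmt, first, last, domain):
    patterns = {
        'first.last': first + '.' + last,
        'last.first': last + '.' + first,
        'firstl': first + last[0],
        'lfirst': last[0] + first,
        'flast': first[0] + last,
        'lastf': last + first[0],
        'first': first,
        'last': last,
    }
    return "{}@{}".format(patterns[fmt], domain)

print(build_email('flast', 'jane', 'doe', 'example.com'))  # jdoe@example.com
```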