Skip to content

Commit

Permalink
Merge pull request #122 from nelsonic/fix-scrapers-#120
Browse files Browse the repository at this point in the history
Chore: Fix Scrapers #120
  • Loading branch information
nelsonic authored Jan 8, 2024
2 parents b4a9e4f + b96a8ec commit d64fde5
Show file tree
Hide file tree
Showing 26 changed files with 8,038 additions and 141 deletions.
8 changes: 8 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Dependabot configuration (config schema version 2).
# Declares a single update entry: npm dependencies whose manifest
# lives in the repository root directory ("/").
version: 2
updates:
- package-ecosystem: npm
directory: "/"
# Check for updates once a week, at 17:00 in the Europe/London timezone.
schedule:
interval: weekly
time: "17:00"
timezone: Europe/London
33 changes: 33 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# This workflow will do a clean install of node dependencies, cache/restore them, build the source code and run tests across different versions of node
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-nodejs-with-github-actions

name: Node.js CI

# Run on every push to `main` and on every pull request that targets `main`.
on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  build:

    runs-on: ubuntu-latest

    strategy:
      matrix:
        node-version: [18.x, 20.x]
        # See supported Node.js release schedule at https://nodejs.org/en/about/releases/

    steps:
    # checkout/setup-node v2 execute on a Node.js runtime that GitHub-hosted
    # runners have retired; v4 is the maintained major version.
    - uses: actions/checkout@v4
    - name: Use Node.js ${{ matrix.node-version }}
      uses: actions/setup-node@v4
      with:
        node-version: ${{ matrix.node-version }}
        cache: 'npm'
    - run: npm ci
    # - run: npm run build --if-present
    - run: npm test
    # codecov-action v1 was sunset by Codecov on 2022-02-01 and no longer
    # uploads reports; v3 still supports tokenless uploads for public repos.
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v3
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,3 @@ node_modules
.vagrant
crawl.js
.DS_Store
package-lock.json
7 changes: 0 additions & 7 deletions .travis.yml

This file was deleted.

4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
Learn how to parse the DOM of a web page
by using your favourite coding community as an example.

[![Build Status](https://img.shields.io/travis/nelsonic/github-scraper/master.svg?style=flat-square)](https://travis-ci.org/nelsonic/github-scraper)
[![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/nelsonic/github-scraper/ci.yml?label=build&style=flat-square&branch=main)](https://github.com/nelsonic/github-scraper/actions)
[![codecov.io](https://img.shields.io/codecov/c/github/nelsonic/github-scraper/main.svg?style=flat-square)](https://codecov.io/github/nelsonic/github-scraper?branch=main)
[![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat-square)](https://github.com/nelsonic/github-scraper/issues)
[![HitCount](http://hits.dwyl.com/nelsonic/github-scraper.svg)](http://hits.dwyl.com/nelsonic/github-scraper)
[![HitCount](https://hits.dwyl.com/nelsonic/github-scraper.svg)](https://hits.dwyl.com/nelsonic/github-scraper)
[![npm package version](https://img.shields.io/npm/v/github-scraper.svg?color=brightgreen&style=flat-square)](https://www.npmjs.com/package/github-scraper)
<!-- uncomment when service is working ...
[![Dependencies: None!](https://david-dm.org/nelsonic/github-scraper/status.svg?style=flat-square)](https://david-dm.org/nelsonic/github-scraper)
Expand Down
2 changes: 1 addition & 1 deletion config/repos.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ const SELECTORS={
COMMIT:".Box-header--blue strong",
LANGUAGES:".BorderGrid--spacious .BorderGrid-row",
FORKED_FROM:'a[data-hovercard-type="repository"]',
FOLLOWERS:'.border-gray-light',
FOLLOWERS:'.Layout-main .d-table',
TOPIC_TAG:".topic-tag",
PROFILE:'div[itemtype="http://schema.org/Person"]'
}
Expand Down
15 changes: 7 additions & 8 deletions lib/followers.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@ module.exports = function followers ($, url, callback) {
var data = { entries : [], url: url};
data.type = url.match(/tab=following/) ? 'following' : 'followers';

// console.log('selectors.FOLLOWERS', selectors.FOLLOWERS);
$(`${selectors.FOLLOWERS}`).each(function(i, el){

$(`${selectors.FOLLOWERS}`).each(function(i,el){

data.entries.push({
avatar:$(this).find('a').first().find('img').first().attr("src"),
fullname:$(this).find('a span').first().text(),
username:$(this).find('a span').last().text()
data.entries.push({
avatar: $(this).find('img.avatar-user').first().attr("src"),
fullname: $(this).find('.Link--primary').first().text(),
username: $(this).find('.Link--secondary').first().text()
})
})
})


data = require('./next_page')($, data); // don't worry require is cached ;-)
callback(null, data)
Expand Down
14 changes: 5 additions & 9 deletions lib/issue.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,14 @@ module.exports = function issue($, url, callback) {

// labels
$('.IssueLabel').each(function(){
data.labels.push($(this).attr('title'));
data.labels.push($(this).attr('data-name'));
})
data.labels = data.labels.filter(function(i) { return i != null });
// stackoverflow.com/questions/9229645/remove-duplicates-from-js-array
data.labels = [...new Set(data.labels)]

var milestone = $('.milestone-name')
if(milestone.length > 0){
data.milestone = milestone[0].attribs.title;
}
var assignee = $('.assignee');
if(assignee.length > 0){
data.assignee = assignee.text().trim();
}
data.milestone = $('.Progress').next().text().trim();
data.assignee = $('.assignee').text().trim();

// participants anyone who has commented or been assigned in the issue
$('.participant-avatar').each(function(){
Expand Down
3 changes: 3 additions & 0 deletions lib/next_page.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@ module.exports = function next_page ($, data) {
if(next) {
data.next_page = next
}
else {
data.next_page = ''
}

return data;
}
14 changes: 8 additions & 6 deletions lib/org.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@
*/
function org($, url, callback) {
var data = { url: url, type: 'org' };
data.name = $('.org-name').first().text().trim();
data.description = $('h1').parent().next().text().trim(); // yep ...¯\_(ツ)_/¯
data.name = $('h1.lh-condensed').first().text().trim();
// data.description = $('h1.lh-condensed').parent().next().text().trim(); // yep ...¯\_(ツ)_/¯
data.description = $('.container-xl .color-fg-muted').first().text().trim()
if($('span[itemprop=location]').length > 0){
data.location = $('span[itemprop=location]')[0].attribs.title;
data.location = $('span[itemprop=location]').first().text().trim();
}
if($('.octicon-link').length > 0){
// console.log($('.octicon-link'));
Expand All @@ -23,15 +24,16 @@ function org($, url, callback) {
// var people = $('.Counter').eq(1); // people is *second* in list of tabs!
// data.pcount = parseInt(people.first().text(), 10);
// data.pcount = isNaN(data.pcount) ? 0 : data.pcount
data.avatar = $('.org-header-wrapper img')[0].attribs.src;
data.avatar = $('.avatar')[0].attribs.src;
var parts = data.avatar.split('/');
data.uid = parseInt(parts[parts.length-1].split('?')[0], 10);
// list of repos
var items = $('li[itemprop="owns"]');
var items = $('li.Box-row');
// console.log('items.length', items.length);
data.entries = []; // avoid having circular reference objects! :-(
items.each( function (i) { // JS counters start at 0.
var parent = 'li[itemprop="owns"]:nth-child(' + (i+1) +') '; // CSS selectors start at 1.
var parent = 'li.Box-row:nth-child(' + (i+1) +') '; // CSS selectors start at 1.
// console.log($(parent))
data.entries.push({
name: $(parent + ' a').first().text().trim(),
lang: $(parent + 'span[itemprop=programmingLanguage]').first().text().trim(),
Expand Down
45 changes: 21 additions & 24 deletions lib/profile.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,11 @@ module.exports = function profile ($, url, callback) {
var parts = data.avatar.split('/');
data.uid = parseInt(parts[parts.length-1].split('?')[0], 10);


$(`${selectors.PROFILE}`).first().find('.js-profile-editable-area a.link-gray .text-bold')
.each(function(i,el){
var stat=$(this).text()
if (stat.indexOf('k') > -1) {
stat = parseFloat(stat.replace('k', ''), 10) * 1000;
} else {
stat = parseInt(stat, 10);
}
tmpData.push(stat)
})

// Profile Stats (Navigation)
$('.Counter').each(function (i,el) {
var stat = $(this).text();
stats.push(stat)
});
data.repos = stats[0];
data.projects = stats[1];
data.stars = tmpData[2]; // number of repositories user has starred
data.followers = tmpData[0]; // number of people folling this user
data.following = tmpData[1]; // number of people this user is following
data.repos = k_to_int($('.UnderlineNav .octicon-repo').first().next().text().trim());
data.projects = k_to_int($('.octicon-table').first().next().text().trim());
data.stars = k_to_int($('.octicon-star').next().text().trim()); // number of repositories user has starred
data.followers = k_to_int($('.js-profile-editable-area .color-fg-default').first().text().trim());
data.following = k_to_int($('.js-profile-editable-area .color-fg-default').eq(1).text().trim());

// Pinned Repos

Expand Down Expand Up @@ -80,12 +63,26 @@ var stat=$(this).text()
})

// GitHub Developer Program member?
var member = $('.bg-purple').text().trim();
var member = $('.octicon-cpu').parent().text().trim();
// yes, this is always on the page but they hide it using CSS!! :-(
var display = $('.bg-purple').parent().hasClass('d-none');
if(member && member === 'Pro' && !display) {
if(member && !display) {
data.developerprogram = true;
}
callback(null, data);
// add task to arana to scrape /{username}?tab=repositories after profile!
}

/**
 * k_to_int transforms an abbreviated counter such as '3.4k' into 3400.
 * Plain counters ('42') and comma-grouped counters ('1,234') are parsed
 * as base-10 integers.
 * @param {string} val - the counter text scraped from the page
 * @returns {number} the integer value; 0 when val is undefined/null and
 *   NaN when the text contains no leading number (e.g. an empty string)
 */
function k_to_int(val) {
  // A missing counter used to throw on val.indexOf; treat it as zero.
  if (val === undefined || val === null) {
    return 0;
  }
  // Drop thousands separators so '1,234' is not truncated to 1.
  const text = String(val).trim().replace(/,/g, '');
  if (text.indexOf('k') > -1) {
    // Round rather than truncate: the float product can land just below
    // the intended integer (e.g. 1004.9999999999999 instead of 1005).
    return Math.round(parseFloat(text.split('k')[0]) * 1000);
  }
  // Explicit radix so the text is always read as decimal.
  return parseInt(text, 10);
}
18 changes: 12 additions & 6 deletions lib/profile_contribs.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,21 @@
* @returns {object} data - the complete GitHub Profile for the username
*/
module.exports = function profile($, data) {
var c = $('.day');
console.log(data)
var c = $('.ContributionCalendar-day');
var matrix = {};
for(var i = 0; i < c.length; i++) {
var e = c[i].attribs; // the entry
matrix[e['data-date']] = {
fill: e['fill'],
count: parseInt(e['data-count'], 10),
x: e['x'],
y: e['y']

var id = e.id.replace('contribution-day-component-','')
// console.log(e.id, id)
if (e['data-date']) {
matrix[e['data-date']] = {
fill: e['fill'],
count: parseInt(e['data-count'], 10),
x: e['data-ix'],
y: id.split('-')[0]
}
}
}
// console.log(matrix)
Expand Down
16 changes: 10 additions & 6 deletions lib/repo.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@ function repo ($, url, callback) {


data.tags = []
data.usedby = parse_int($('.social-count').text());
data.watchers = parse_int(badges['0'].children[0].data);
data.stars = parse_int(badges['1'].children[0].data);
data.forks = parse_int(badges['2'].children[0].data);
data.commits = parse_int($(`${selectors.COMMIT}`).first().text());
data.usedby = parse_int($('.hx_flex-avatar-stack').next().text().trim());
data.watchers = parse_int(strip($('.octicon-eye').parent().text().trim()));
data.stars = parse_int(strip($('.Layout-sidebar .octicon-star').parent().text().trim()));
data.forks = parse_int(strip($('.Layout-sidebar .octicon-repo-forked').parent().text().trim()));
data.commits = parse_int($('.octicon-history').parent().text().trim());
data.branches = parse_int($('.octicon-git-branch').next().text());
data.releases = parse_int($('.octicon-tag').next().text());
// data.releases = parse_int($('.octicon-tag').next().text());
data.langs = []; // languages used in the repo:
$(`${selectors.LANGUAGES}`).first().find(`${selectors.TOPIC_TAG}`)
.each(function(i,a){
Expand All @@ -51,3 +51,7 @@ function repo ($, url, callback) {
}

module.exports = repo;

/**
 * strip keeps only the first line of a (possibly multi-line) string.
 * @param {string} str - text that may contain '\n' separated lines
 * @returns {string} the text before the first '\n' (all of it if none)
 */
function strip(str) {
  const [firstLine] = str.split('\n');
  return firstLine;
}
3 changes: 2 additions & 1 deletion lib/stars_watchers.js
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,9 @@
*/
module.exports = function stargazers_watchers ($, url, callback) {
var data = { entries : [], url: url, type: 'stars' };
data.stars = $('.tabnav .Counter').text().trim()

$('.follow-list img.avatar').each(function (i, el) {
$('.list-style-none img.avatar').each(function (i, el) {
var src = el.attribs.src;
var parts = src.split('/');
var uid = parseInt(parts[parts.length-1].split('?')[0], 10);
Expand Down
9 changes: 1 addition & 8 deletions lib/switcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,6 @@ module.exports = function switcher (url, callback) {
// scraper = 'starred';
// }
// else if($('.commits').length > 0) {

else if(url.match(/people/)) {
scraper = 'people';
}
else {
scraper = 'repo';
}
Expand All @@ -88,12 +84,9 @@ module.exports = function switcher (url, callback) {
// else if(url.match(/labels/)) {
// scraper = 'labels';
// }
else if($('.issue').length > 0) {
if($('.issue').length > 0) {
scraper = 'issue';
}
else {
scraper = 'repo';
}

// else { // else if(url.match(/issues/)) {
// scraper = 'issues';
Expand Down
Loading

0 comments on commit d64fde5

Please sign in to comment.