-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex.js
executable file
·74 lines (61 loc) · 2.2 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env node
'use strict';
const puppeteer = require('puppeteer');
const parser = require('node-html-parser');
const fs = require('fs');
// Notion page handed to parse() at the bottom of this file; its links are the starting point of the scrape.
const url = 'https://www.notion.so/Paper-Notes-by-Vitaly-Kurin-97827e14e5cd4183815cfe3a5ecf2f4c';
/**
 * Navigates `page` to `url`, waits for the network to go idle, and returns
 * the parsed `#notion-app` element of the resulting document.
 *
 * @param {object} page - Puppeteer page used for the navigation.
 * @param {string} url - Address to load.
 * @returns {Promise<object|null>} Whatever `querySelector('#notion-app')`
 *   yields on the parsed markup.
 */
async function getNotionPage(page, url) {
  await page.goto(url, { waitUntil: 'networkidle0' });
  const html = await page.content();
  return parser.parse(html).querySelector('#notion-app');
}
/**
 * Reads `pairs.txt` from the current working directory and returns the
 * Notion paths (the part after `https://www.notion.so`) it already contains.
 *
 * Each useful line has the form `<text>,https://www.notion.so<path>`.
 * Lines without a comma, or whose last field is not a notion.so URL, are
 * ignored.
 *
 * @returns {Promise<string[]>} Recorded paths in file order; an empty array
 *   when `pairs.txt` does not exist yet.
 * @throws Any read error other than a missing file.
 */
async function getExistingUrls() {
  let data;
  try {
    data = await fs.promises.readFile('pairs.txt', 'utf8');
  } catch (err) {
    // A missing file just means nothing has been recorded so far.
    if (err && err.code === 'ENOENT') {
      return [];
    }
    throw err;
  }

  return data
    .split('\n')
    .map((line) => {
      // trim() drops a trailing '\r' from CRLF files, which would otherwise
      // stay glued to the path and defeat later comparisons.
      const trimmed = line.trim();
      // The URL is the last field; the text before it may itself contain commas.
      const comma = trimmed.lastIndexOf(',');
      return comma === -1 ? undefined : trimmed.slice(comma + 1);
    })
    .filter(Boolean)
    .map((pageUrl) => pageUrl.split('https://www.notion.so')[1])
    .filter(Boolean);
}
/**
 * Extracts the `href` value from an element's raw attribute string.
 *
 * @param {string} rawAttrs - Raw attribute text, e.g. `class="x" href="/y"`.
 * @returns {string|undefined} The href value, or undefined when the string
 *   carries no quoted href attribute.
 */
function extractHref(rawAttrs) {
  const match = /(?:^|\s)href\s*=\s*(?:"([^"]*)"|'([^']*)')/.exec(rawAttrs || '');
  if (!match) {
    return undefined;
  }
  return match[1] !== undefined ? match[1] : match[2];
}

/**
 * Scrapes the Notion page at `url` for site-relative links that are not yet
 * recorded in `pairs.txt`, visits each of them, and appends one
 * `<arxiv text>,<notion url>` line per visited page to `pairs.txt`.
 *
 * Link collection stops at the first link that is already recorded. When a
 * visited page has no span mentioning "arxiv.org", the line is still written
 * with the literal text `undefined` in the first field, so the page counts
 * as recorded on later runs.
 *
 * @param {string} url - Address of the Notion page to scan.
 * @returns {Promise<void>} Settles once every line is written and the
 *   browser has been closed.
 * @throws When a page cannot be loaded or parsed, or a write fails. The
 *   browser is closed in that case as well.
 */
async function parse(url) {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    const existingUrls = new Set(await getExistingUrls());
    const notionPage = await getNotionPage(page, url);
    if (!notionPage) {
      throw new Error('Could not find the Notion page content at ' + url);
    }

    const notionLinks = [];
    for (const link of notionPage.querySelectorAll('a')) {
      const href = extractHref(link.rawAttrs);
      // Anchors without an href and off-site links are not candidates.
      if (href === undefined || !href.startsWith('/')) {
        continue;
      }
      // have we reached a known url yet?
      if (existingUrls.has(href)) {
        break;
      }
      notionLinks.push('https://www.notion.so' + href);
    }
    console.log("Number of new links: ", notionLinks.length);

    // Lines are appended in the reverse of page order; presumably so the
    // link nearest the top of the page is recorded last. Because the writes
    // are sequential, an aborted run only leaves a prefix of that order.
    notionLinks.reverse();
    for (const l of notionLinks) {
      // Randomised 0.5-1s pause between page loads.
      await new Promise(resolve => setTimeout(resolve, 500 + 500 * Math.random()));
      const p = await getNotionPage(page, l);
      if (!p) {
        throw new Error('Could not find the Notion page content at ' + l);
      }

      let aLink;
      for (const s of p.querySelectorAll('span')) {
        const text = (s.childNodes[0] || {}).rawText || "";
        if (text.includes("arxiv.org")) {
          aLink = text;
          break;
        }
      }
      // Awaited so write failures surface and lines land in order.
      await fs.promises.appendFile('pairs.txt', aLink + ',' + l + '\n');
    }
  } finally {
    // Always release the browser process, even when scraping failed.
    await browser.close();
  }
}
// Script entry point: run the scrape and, on failure, report the error and
// exit non-zero instead of leaving the rejected promise unhandled.
parse(url).catch((err) => {
  console.error(err);
  process.exitCode = 1;
});