-
Notifications
You must be signed in to change notification settings - Fork 1
/
index.js
132 lines (104 loc) · 4.18 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
const fs = require('fs');
const puppeteer = require('puppeteer');
const logger = require('pino')();
const { getNthParent, sleep } = require('./utils/common');
const { ROOT_URL } = require('./utils/constants');
const fsPromises = fs.promises;
// CSS selector matching a <div> that carries the classes `category`, `with_children` and
// `loaded` and has aria-expanded="true". The scraper uses it to scope queries to an expanded
// category; writing it twice separated by a space targets an expanded category nested inside
// another expanded category.
const CHILDRENLOADED = 'div.category.with_children.loaded[aria-expanded="true"]';
/**
 * Collects the course links that sit behind a "more" pagination link.
 *
 * Looks for a `div.paging.paging-morelink > a` anchor inside the courses of a nested
 * expanded category (an expanded category within an expanded category). When the anchor
 * exists, its target is opened in a temporary tab, every course link found under
 * `div[role="main"] div.content` is read, and the tab is closed again.
 *
 * @param {object} browser - Puppeteer browser used to open the temporary tab.
 * @param {object} page - Puppeteer page currently showing the expanded category tree.
 * @returns {Promise<Array<{subject: string, href: string}>>} One entry per course link on
 *   the paginated page, or an empty array when there is no pagination link.
 * @throws Propagates any Puppeteer error raised while navigating or scraping the tab.
 */
const getPaginatedContentIfExist = async (browser, page) => {
  const selector = `${CHILDRENLOADED} ${CHILDRENLOADED} div.courses div.paging.paging-morelink > a`;
  const moreLink = await page.$(selector);
  if (moreLink === null) {
    return [];
  }

  logger.info('This course has pagination.');
  const tabHref = await page.evaluate((p) => p.href, moreLink);
  const tab = await browser.newPage();
  try {
    await tab.goto(tabHref);
    // The tab stays in the background; call tab.bringToFront() here to watch it while debugging.
    await tab.waitForSelector('div[role="main"] div.content');
    return await tab.$$eval(
      'div[role="main"] div.content div.courses div.coursename > a',
      (xs) => xs.map((e) => ({ subject: e.innerHTML, href: e.href }))
    );
  } finally {
    // Always release the temporary tab, even when navigation or scraping throws.
    await tab.close();
  }
};
/**
 * Script entry point.
 *
 * Opens ROOT_URL in a visible browser window, expands every year category, then every
 * period inside it that has children, and collects the course links of each period
 * (including those behind a pagination link). The result is written to
 * data/initial.json as:
 *
 *   [{ year, href, periods: [{ period, href, content: [{ subject, href }] }] }]
 *
 * Exits with code 0 on success and 1 after logging any error.
 */
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch({
      headless: false,
      defaultViewport: null,
      args: ['--window-size=1200,800'],
    });
    const page = await browser.newPage();
    // A timeout of 0 disables Puppeteer's default timeouts for this page
    // (see the Page.setDefaultTimeout documentation).
    page.setDefaultTimeout(0);

    await page.goto(ROOT_URL);
    await page.waitForSelector('div.subcategories');
    const years = await page.$$eval(
      'div.subcategories > div.category > div.info > h3.categoryname > a',
      (xs) => xs.map((e) => ({ year: e.innerHTML, href: e.href }))
    );

    const data = [];
    for (const { year, href } of years) {
      // Clicking the anchor's parent node expands the year without following the link.
      const link = await page.$(`a[href="${href}"]`);
      const row = await link.getProperty('parentNode');
      await row.click();
      // `sleep` comes from ./utils/common; the unit is presumably seconds - TODO confirm.
      await sleep(2);
      await page.waitForSelector(CHILDRENLOADED);

      const yearsContent = await page.$$eval(`${CHILDRENLOADED} h4.categoryname > a`, (xs) =>
        xs.map((e) => ({ period: e.innerHTML, href: e.href }))
      );

      const periods = [];
      for (const { href: hrefYearsContent, period } of yearsContent) {
        logger.info(`Period: ${period}`);
        const periodLink = await page.$(`a[href="${hrefYearsContent}"]`);
        const periodRow = await periodLink.getProperty('parentNode');
        const parent = await getNthParent(periodRow, 2);
        // Read the class list through the public JSHandle API rather than the private
        // `_remoteObject` field, which is not part of Puppeteer's supported interface.
        const classNameHandle = await parent.getProperty('className');
        const className = await classNameHandle.jsonValue();
        // Periods without the `with_children` class have nothing to expand.
        if (!className.includes('with_children')) continue;

        // Specific focus here. If the page didn't focus the element before click, sometimes
        // periodRow.click broke with the msg "Node is either not visible or not an HTMLElement"
        // Reference 1: https://github.com/puppeteer/puppeteer/issues/1769
        // Reference 2: https://github.com/puppeteer/puppeteer/issues/1805
        logger.info('Pre periodRowClick');
        await page.focus(
          `${CHILDRENLOADED} div.category.notloaded.with_children.collapsed h4.categoryname`
        );
        await periodRow.click();
        await sleep(2);
        await page.waitForSelector(`${CHILDRENLOADED} ${CHILDRENLOADED}`);

        const periodContent = await page.$$eval(
          `${CHILDRENLOADED} ${CHILDRENLOADED} div.courses div.coursename > a`,
          (xs) => xs.map((e) => ({ subject: e.innerHTML, href: e.href }))
        );
        const paginatedContent = await getPaginatedContentIfExist(browser, page);

        periods.push({
          period,
          // The period's own link, not the enclosing year's `href`.
          href: hrefYearsContent,
          content: [...periodContent, ...paginatedContent],
        });

        // Collapse the period again before moving on to the next one.
        await periodRow.click();
        await sleep(3);
      }

      data.push({
        year,
        href,
        periods,
      });

      // Collapse the year again before expanding the next one.
      await row.click();
      await sleep(2);
    }

    const jsonContent = JSON.stringify(data, null, 4);
    await fsPromises.mkdir('data', { recursive: true });
    await fsPromises.writeFile('data/initial.json', jsonContent, 'utf8');
    await browser.close();
    process.exit(0);
  } catch (error) {
    // Pass the error object as well so the logger can record its stack trace.
    logger.error(error, `Error: ${error}`);
    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        logger.error(closeError, `Error closing browser: ${closeError}`);
      }
    }
    process.exit(1);
  }
})();