Skip to content

Commit

Permalink
UTF-8以外の文字コードの記事に対応
Browse files Browse the repository at this point in the history
fix #42
  • Loading branch information
cp-20 committed Jan 31, 2024
1 parent 40c20f9 commit a4e2f9a
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 16 deletions.
5 changes: 3 additions & 2 deletions packages/lib/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@
"@mozilla/readability": "^0.5.0",
"@read-stack/tsconfig": "workspace:*",
"cheerio": "1.0.0-rc.12",
"iconv-lite": "^0.6.3",
"jsdom": "^23.0.1",
"zod": "^3.22.4"
},
"devDependencies": {
"@types/jsdom": "^21.1.6",
"@read-stack/eslint-config": "workspace:*"
"@read-stack/eslint-config": "workspace:*",
"@types/jsdom": "^21.1.6"
}
}
71 changes: 57 additions & 14 deletions packages/lib/src/crawler/readability.ts
Original file line number Diff line number Diff line change
@@ -1,29 +1,72 @@
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
import { decode } from 'iconv-lite';

import type { ArticleResponse } from '@/crawler';

export const fetchUsingReadability = async (
url: string,
): Promise<ArticleResponse | null> => {
const response = await fetch(url);
const html = await response.text();
/** Non-UTF-8 charsets (legacy Japanese encodings) that get special handling. */
const additionalCharset = ['shift_jis', 'euc-jp'];

/**
 * Case-insensitive matchers built from `additionalCharset`.
 * Entry `i` here corresponds to entry `i` of `additionalCharset`.
 */
const additionalCharsetRegex = additionalCharset.map(
  (name) => new RegExp(name, 'i'),
);

const dom = new JSDOM(html);
const reader = new Readability(dom.window.document);
const article = reader.parse();
/**
 * Determines which character encoding a parsed HTML document declares.
 *
 * The HTML4-style declaration
 * (`<meta http-equiv="content-type" content="...; charset=...">`) is consulted
 * first; the HTML5-style `<meta charset="...">` is used as a fallback.
 *
 * @param document - Document whose `<meta>` elements are inspected.
 * @returns The entry of `additionalCharset` that the declared charset matches,
 *   or `'utf-8'` when nothing is declared or the declaration matches no entry.
 */
const detectCharset = (document: Document) => {
  // Match the `charset` parameter case-insensitively and stop at the first
  // delimiter so trailing parameters or quotes are not captured.
  const html4 = document
    .querySelector('meta[http-equiv="content-type"]')
    ?.getAttribute('content')
    ?.match(/charset\s*=\s*["']?(?<charset>[^"';\s]+)/i)?.groups?.charset;
  const html5 = document
    .querySelector('meta[charset]')
    ?.getAttribute('charset');

  const declared = html4 ?? html5 ?? null;
  if (declared === null) return 'utf-8';

  // If several entries match, the last one in `additionalCharset` wins.
  let detected = 'utf-8';
  additionalCharsetRegex.forEach((pattern, i) => {
    if (pattern.test(declared)) detected = additionalCharset[i];
  });

  return detected;
};

/**
 * Extracts the readable article from an already-parsed HTML document.
 *
 * @param url - Address the document was loaded from; copied into the result unchanged.
 * @param document - Document handed to Readability and searched for the `og:image` meta tag.
 * @returns The article's URL, title, body and `og:image` URL (`null` when the
 *   page has no such meta tag), or `null` when Readability cannot extract an article.
 */
const parseArticle = (
  url: string,
  document: Document,
): ArticleResponse | null => {
  const article = new Readability(document).parse();
  if (article === null) return null;

  const ogImageUrl =
    document.querySelector<HTMLMetaElement>('meta[property="og:image"]')
      ?.content ?? null;

  // Each key appears exactly once; `body` carries Readability's `content`
  // field (the processed article markup per Readability's documentation),
  // not `textContent`.
  return {
    url,
    title: article.title,
    body: article.content,
    ogImageUrl,
  };
};

// Shared decoder for the first decoding pass; a TextDecoder constructed
// without a label decodes UTF-8.
const decoder = new TextDecoder();

/**
 * Downloads a page and extracts its article, re-decoding the response when the
 * page declares one of the additional (non-UTF-8) charsets.
 *
 * @param url - Address of the page to fetch.
 * @returns The parsed article, or `null` when no article could be extracted.
 */
export const fetchUsingReadability = async (
  url: string,
): Promise<ArticleResponse | null> => {
  const rawBytes = await (await fetch(url)).arrayBuffer();

  // First pass: decode as UTF-8 so the charset declaration can be read.
  const utf8Document = new JSDOM(decoder.decode(rawBytes)).window.document;
  const charset = detectCharset(utf8Document);
  if (charset === 'utf-8') return parseArticle(url, utf8Document);

  // Second pass: decode the same bytes with the declared charset and re-parse.
  const redecodedHtml = decode(Buffer.from(rawBytes), charset);
  return parseArticle(url, new JSDOM(redecodedHtml).window.document);
};
3 changes: 3 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit a4e2f9a

Please sign in to comment.