Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[react-pdf] text tokenize 및 chunking 로직을 적용하고, custom click handler 로직을 적용합니다. #31

Merged
merged 3 commits into from
May 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions packages/react-pdf/src/components/Pages.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,14 @@ import {PageSvg} from './page/Svg'
/** Rendering options accepted by `Pages` (and, minus `lazyLoading`, by each `Page`). */
export interface PagesProps {
/** Selects the page renderer: `'canvas'` renders `PageCanvas`, `'svg'` renders `PageSvg`. */
renderMode?: 'canvas' | 'svg'
/** When truthy, `Pages` maps over its `renderPages` state instead of every page number. */
lazyLoading?: boolean
/** Forwarded through `Page` to `TextLayer` as its `tokenize` prop. */
tokenize?: boolean
}

export const Page = memo(function Page({renderMode, pageNumber}: PagesProps & {pageNumber: number}) {
export const Page = memo(function Page({
renderMode,
tokenize,
pageNumber,
}: Omit<PagesProps, 'lazyLoading'> & {pageNumber: number}) {
const {pdf} = usePdfContext()
const [page, setPage] = useState<PDFPageProxy | undefined>()

Expand All @@ -34,13 +39,13 @@ export const Page = memo(function Page({renderMode, pageNumber}: PagesProps & {p
<div style={{position: 'relative'}} data-page-number={pageNumber}>
{renderMode === 'canvas' && <PageCanvas page={page} />}
{renderMode === 'svg' && <PageSvg page={page} />}
<TextLayer page={page} />
<TextLayer page={page} tokenize={tokenize} />
<AnnotationLayer page={page} />
</div>
)
})

export const Pages = memo(function Pages({renderMode, lazyLoading, children}: PropsWithChildren<PagesProps>) {
export const Pages = memo(function Pages({renderMode, lazyLoading, tokenize, children}: PropsWithChildren<PagesProps>) {
const {pdf} = usePdfContext()
const pageNumbers = useMemo(() => Array.from({length: pdf.numPages}, (_, index) => index + 1), [pdf.numPages])
const [renderPages, setRenderPages] = useState<number[]>(pdf.numPages > 0 ? [1] : [])
Expand All @@ -61,7 +66,7 @@ export const Pages = memo(function Pages({renderMode, lazyLoading, children}: Pr
{(lazyLoading ? renderPages : pageNumbers).map((pageNumber) => {
return (
<div key={pageNumber} ref={lazyLoading && renderPages.length === pageNumber ? ref : null}>
<Page renderMode={renderMode} pageNumber={pageNumber} />
<Page renderMode={renderMode} pageNumber={pageNumber} tokenize={tokenize} />
</div>
)
})}
Expand Down
56 changes: 49 additions & 7 deletions packages/react-pdf/src/components/PdfViewer.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import {ReactNode, useState} from 'react'
import {MouseEventHandler, ReactNode, useCallback, useState} from 'react'

import {PDFProvider} from '../contexts/pdf'
import {useIsomorphicLayoutEffect} from '../hooks/useIsomorphicLayoutEffect'
Expand All @@ -23,7 +23,15 @@ export type PDFViewerProps = PagesProps & {
}
}

export function PDFViewer({pdfUrl, renderMode = 'canvas', header, footer, options}: PDFViewerProps) {
export function PDFViewer({
pdfUrl,
renderMode = 'canvas',
tokenize,
onClickWords,
header,
footer,
options,
}: PDFViewerProps) {
const [pdf, setPdf] = useState<PDFDocumentProxy | undefined>()

useIsomorphicLayoutEffect(() => {
Expand All @@ -47,15 +55,49 @@ export function PDFViewer({pdfUrl, renderMode = 'canvas', header, footer, option
init()
}, [options?.cMapCompressed, options?.cMapUrl, options?.withCredentials, pdf, pdfUrl])

/**
 * Click handler that runs the callback of the first `onClickWords` entry whose
 * `target` matches the trimmed text of the clicked SPAN element.
 *
 * - A string `target` must equal the clicked text exactly.
 * - A RegExp `target` is tested against the clicked text.
 * - Clicks on non-SPAN elements, or on elements without text, are ignored.
 */
const handleClickWords: MouseEventHandler<HTMLDivElement | HTMLSpanElement> = useCallback(
  async (e) => {
    if (!onClickWords) {
      return
    }
    const element = e.target as HTMLElement
    const clickedText = (element?.innerText || '').trim()
    const isSpanTag = element?.tagName === 'SPAN'
    if (!clickedText || !isSpanTag) {
      return
    }
    // NOTE(review): `for await` also unwraps promise entries; a plain `for...of` is enough
    // if `onClickWords` is always a plain array — confirm against PDFViewerProps.
    for await (const {target, callback} of onClickWords) {
      let result = false
      if (typeof target === 'string') {
        result = target === clickedText
      }
      if (target instanceof RegExp) {
        // A global/sticky RegExp remembers `lastIndex` between `test` calls, which would make
        // the same word match on one click and fail on the next; always start from index 0.
        target.lastIndex = 0
        result = target.test(clickedText)
      }
      if (result) {
        // Only the first matching entry is handled.
        await callback()
        return
      }
    }
  },
  [onClickWords],
)

if (!pdf) {
return null
}

return (
<PDFProvider pdf={pdf} options={options}>
{header}
<Pages renderMode={renderMode} lazyLoading={options?.lazyLoading || true} />
{footer}
</PDFProvider>
<div onClick={handleClickWords}>
<PDFProvider pdf={pdf} options={options}>
{header}
<Pages
renderMode={renderMode}
lazyLoading={options?.lazyLoading || true}
tokenize={tokenize ?? (onClickWords || []).length > 0}
/>
{footer}
</PDFProvider>
</div>
)
}
11 changes: 6 additions & 5 deletions packages/react-pdf/src/components/layer/Text.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ import {memo, useCallback, useMemo, useState} from 'react'
import classNames from 'classnames/bind'

import {useIsomorphicLayoutEffect} from '../../hooks/useIsomorphicLayoutEffect'
import {mergeTextItems} from '../../utils/text'
import styles from './Text.module.scss'

import type {TextContent, PDFPageProxy, TextContentItem} from '../../pdfjs-dist/types/pdfjs'
Expand Down Expand Up @@ -68,20 +69,20 @@ export const TextLayerItem = memo(function TextLayerItem({

/** Props accepted by `TextLayer`. */
interface TextLayerProps {
/** Page whose text content is loaded via `page.getTextContent()`. */
page: PDFPageProxy
/** Passed as the `tokenize` option to `mergeTextItems` when the text items are prepared. */
tokenize?: boolean
}

export const TextLayer = memo(function TextLayer({page}: TextLayerProps) {
export const TextLayer = memo(function TextLayer({page, tokenize}: TextLayerProps) {
const [texts, setTexts] = useState<TextContent | undefined>()
const viewport = page.getViewport({scale: 1})

useIsomorphicLayoutEffect(() => {
async function init() {
const textContent = await page.getTextContent()
// TODO: chunking
setTexts(textContent)
const {items, styles: textStyles} = await page.getTextContent()
setTexts({items: mergeTextItems(items, {tokenize}), styles: textStyles})
}
init()
}, [page])
}, [page, tokenize])

if (!texts) {
return null
Expand Down
40 changes: 40 additions & 0 deletions packages/react-pdf/src/utils/text.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import type {TextContentItem} from '../pdfjs-dist/types/pdfjs'

/**
 * Splits every text item into one item per space-separated word.
 *
 * Each word item keeps the remaining properties of its source item and receives:
 * - `width`: 4.5 for an empty/blank piece, otherwise the word's proportional share of the
 *   source width (average character width × word length, rounded up) plus 5.
 * - `transform`: a copy of the previous word's transform whose index 4 is advanced by the
 *   previous word's width plus 3.5; the first word starts from a copy of the source transform.
 *
 * @param texts Text items to split; neither the items nor their transforms are modified.
 * @returns A new flat array of word-level text items in source order.
 */
function tokenizeTextItems(texts: TextContentItem[]) {
  const tokens: TextContentItem[] = []

  for (const {str, width, transform, ...rest} of texts) {
    const averageCharWidth = width / str.length
    let previous: TextContentItem | undefined

    for (const word of str.split(' ')) {
      const isBlank = word.trim().length === 0
      const wordWidth = isBlank ? 4.5 : Math.ceil(averageCharWidth * word.length) + 5

      // Start from the previous word's position (or the item's own for the first word).
      const wordTransform = previous ? [...previous.transform] : [...transform]
      wordTransform[4] += previous ? previous.width + 3.5 : 0

      const token: TextContentItem = {str: word, width: wordWidth, transform: wordTransform, ...rest}
      tokens.push(token)
      previous = token
    }
  }

  return tokens
}

/**
 * Merges consecutive text items that share the same y coordinate (`transform[5]`) into a
 * single item per line by concatenating their `str` and summing their `width`.
 *
 * The input items are never modified: each line starts from a shallow copy of its first item.
 *
 * @param texts Text items in reading order.
 * @param options.tokenize When true, the merged lines are split into word-level items
 *   with `tokenizeTextItems` before being returned.
 * @returns A new array of merged (and optionally tokenized) text items.
 */
export function mergeTextItems(texts: TextContentItem[], options?: {tokenize?: boolean}) {
  const mergedTextItems = texts.reduce((result, token) => {
    const prev = result[result.length - 1]
    // Compare the y values to check whether both items are on the same line.
    if (prev !== undefined && prev.transform[5] === token.transform[5]) {
      prev.str = prev.str + token.str
      prev.width = prev.width + token.width
    } else {
      // Push a copy so that the merge above never writes into the caller's items.
      result.push({...token})
    }

    return result
  }, [] as TextContentItem[])
  return options?.tokenize ? tokenizeTextItems(mergedTextItems) : mergedTextItems
}
Loading