diff --git a/package.json b/package.json index aef88575b..23577e9bf 100644 --- a/package.json +++ b/package.json @@ -107,6 +107,7 @@ "foreman": "^3.0.1", "jquery": "^3.5.0", "kind-of": "^6.0.3", + "mime": "^2.4.6", "minimist": "^1.2.2", "node-forge": "^0.10.0", "node-sass": "^4.14.1", diff --git a/src/models/applicationState.ts b/src/models/applicationState.ts index 517e12d4a..b7cf121af 100644 --- a/src/models/applicationState.ts +++ b/src/models/applicationState.ts @@ -164,6 +164,7 @@ export interface IAsset { ocr?: any, isRunningOCR?: boolean, cachedImage?: string, + mimeType?: string, } /** @@ -346,6 +347,14 @@ export enum AssetType { TIFF = 6, } +export enum AssetMimeType { + PDF = "application/pdf", + TIFF = "image/tiff", + JPG = "image/jpg", + PNG = "image/png", + BMP = "image/bmp", +} + /** * @name - Asset State * @description - Defines the state of the asset with regard to the tagging process diff --git a/src/react/components/pages/editorPage/canvas.tsx b/src/react/components/pages/editorPage/canvas.tsx index 045bfd697..d8078093c 100644 --- a/src/react/components/pages/editorPage/canvas.tsx +++ b/src/react/components/pages/editorPage/canvas.tsx @@ -1158,7 +1158,7 @@ export default class Canvas extends React.Component return; } try { - const ocr = await this.ocrService.getRecognizedText(asset.path, asset.name, this.setOCRStatus, force); + const ocr = await this.ocrService.getRecognizedText(asset.path, asset.name, asset.mimeType, this.setOCRStatus, force); if (asset.id === this.state.currentAsset.asset.id) { // since get OCR is async, we only set currentAsset's OCR this.setState({ diff --git a/src/react/components/pages/editorPage/editorPage.tsx b/src/react/components/pages/editorPage/editorPage.tsx index dfdfadc97..001f9889f 100644 --- a/src/react/components/pages/editorPage/editorPage.tsx +++ b/src/react/components/pages/editorPage/editorPage.tsx @@ -720,7 +720,7 @@ export default class EditorPage extends React.Component => this.pollForFetchAPI(() => fetch(filePath), 1000, 200); + const response = await getFetchSteam(); + checkFileType = await BrowserFileType.fromStream(response.body); + } catch { + // do nothing + } corruptFileName = fileName.split("%2F").pop().replace(/%20/g, " "); } - if (!types) { + let fileType; + let mimeType; + if (checkFileType) { + fileType = checkFileType.ext; + mimeType = checkFileType.mime; + } + + if (!fileType) { console.error(interpolate(strings.editorPage.assetWarning.incorrectFileExtension.failedToFetch, { fileName: corruptFileName.toLocaleUpperCase() })); } - // If file was renamed/spoofed - fix file extension to true MIME type and show message - else if (!types.includes(assetFormat)) { - assetFormat = types[0]; + // If file was renamed/spoofed - fix file extension to true MIME if it's type is in supported file types and show message + else if (fileType !== assetFormat) { + assetFormat = fileType; + assetMimeType = mimeType; console.error(`${strings.editorPage.assetWarning.incorrectFileExtension.attention} ${corruptFileName.toLocaleUpperCase()} ${strings.editorPage.assetWarning.incorrectFileExtension.text} ${corruptFileName.toLocaleUpperCase()}`); } } @@ -209,6 +228,7 @@ export class AssetService { name: fileName, path: filePath, size: null, + mimeType: assetMimeType, }; } @@ -233,36 +253,6 @@ export class AssetService { } } - // If extension of a file was spoofed, we fetch only first 4 or needed amount of bytes of the file and read MIME type - public static async getMimeType(uri: string): Promise { - const getFirst4bytes = (): Promise => this.pollForFetchAPI(() => fetch(uri, { headers: { range: `bytes=0-${mimeBytesNeeded}` } }), 1000, 200); - let first4bytes: Response; - try { - first4bytes = await getFirst4bytes() - } catch { - return new Promise((resolve) => { - resolve(null); - }); - } - const arrayBuffer: ArrayBuffer = await first4bytes.arrayBuffer(); - const blob: Blob = new Blob([new Uint8Array(arrayBuffer).buffer]); - const isMime = (bytes: Uint8Array, mime: IMime): boolean => { - return mime.pattern.every((p, i) => !p || bytes[i] === p); - }; - const fileReader: FileReader = new FileReader(); - - return new Promise((resolve, reject) => { - fileReader.onloadend = (e) => { - if (!e || !fileReader.result) { - return []; - } - const bytes: Uint8Array = new Uint8Array(fileReader.result as ArrayBuffer); - const type: string[] = imageMimes.filter((mime) => isMime(bytes, mime))?.[0]?.types; - resolve(type || []); - }; - fileReader.readAsArrayBuffer(blob); - }); - } private assetProviderInstance: IAssetProvider; private storageProviderInstance: IStorageProvider; diff --git a/src/services/ocrService.ts b/src/services/ocrService.ts index 4243b2628..031020f5a 100644 --- a/src/services/ocrService.ts +++ b/src/services/ocrService.ts @@ -33,6 +33,7 @@ export class OCRService { public async getRecognizedText( filePath: string, fileName: string, + mimeType: string, onStatusChanged?: (ocrStatus: OcrStatus) => void, rewrite?: boolean ): Promise { @@ -47,11 +48,11 @@ export class OCRService { notifyStatusChanged(OcrStatus.loadingFromAzureBlob); ocrJson = await this.readOcrFile(ocrFileName); if (!this.isValidOcrFormat(ocrJson) || rewrite) { - ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName); + ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName, mimeType); } } catch (e) { notifyStatusChanged(OcrStatus.runningOCR); - ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName); + ocrJson = await this.fetchOcrUriResult(filePath, fileName, ocrFileName, mimeType); } finally { notifyStatusChanged(OcrStatus.done); } @@ -81,7 +82,7 @@ export class OCRService { } } - private fetchOcrUriResult = async (filePath: string, fileName: string, ocrFileName: string) => { + private fetchOcrUriResult = async (filePath: string, fileName: string, ocrFileName: string, mimeType: string) => { try { let body; let headers; @@ -93,10 +94,8 @@ export class OCRService { ] ); body = bodyAndType[0]; - const fileType = bodyAndType[1].mime; - headers = { "Content-Type": fileType, "cache-control": "no-cache" }; - } - else { + headers = { "Content-Type": mimeType, "cache-control": "no-cache" }; + } else { body = { url: filePath }; headers = { "Content-Type": "application/json" }; } diff --git a/yarn.lock b/yarn.lock index b17df3d3c..a032d80ae 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8438,7 +8438,7 @@ mime@1.6.0: resolved "https://registry.yarnpkg.com/mime/-/mime-1.6.0.tgz#32cd9e5c64553bd58d19a568af452acff04981b1" integrity sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg== -mime@^2.4.4, mime@^2.4.5: +mime@^2.4.4, mime@^2.4.5, mime@^2.4.6: version "2.4.6" resolved "https://registry.yarnpkg.com/mime/-/mime-2.4.6.tgz#e5b407c90db442f2beb5b162373d07b69affa4d1" integrity sha512-RZKhC3EmpBchfTGBVb8fb+RL2cWyw/32lshnsETttkBAyAUXSGHxbEJWWRXc751DrIxG1q04b8QwMbAwkRPpUA==