From b625f61a3aade6a0587be3b46bd137f94f48bf27 Mon Sep 17 00:00:00 2001 From: Corentin Thomasset Date: Wed, 22 Jan 2025 14:55:31 +0100 Subject: [PATCH] feat(extractors): refactor image extraction to use child_process for Tesseract --- fixtures/007.expected | 10 ++--- package.json | 1 - pnpm-lock.yaml | 69 +++------------------------------ src/extractors/img.extractor.ts | 23 ++++++++--- 4 files changed, 28 insertions(+), 75 deletions(-) diff --git a/fixtures/007.expected b/fixtures/007.expected index 0fd0daf..84719b9 100644 --- a/fixtures/007.expected +++ b/fixtures/007.expected @@ -1,4 +1,4 @@ -at his touch of a certain icy pang along my blood. “Come, sir,” said I. +at his touch ofa certain icy pang along my blood. “Come, sir,’ said I. “You forget that I have not yet the pleasure of your acquaintance. Be seated, if you please.” And I showed him an example, and sat down myself in my customary seat and with as fair an imitation of my or- @@ -6,15 +6,15 @@ dinary manner to a patient, as the lateness of the hour, the nature of my preoccupations, and the horror I had of my visitor, would suffer me to muster. -“I beg your pardon, Dr. Lanyon,” he replied civilly enough. “What +“I beg your pardon, Dr. Lanyon,’ he replied civilly enough. “What you say is very well founded; and my impatience has shown its heels to my politeness. I come here at the instance of your colleague, Dr. Henry Jekyll, on a piece of business of some moment; and I under- stood...” He paused and put his hand to his throat, and I could see, in spite of his collected manner, that he was wrestling against the -approaches of the hysteria—“T understood, a drawer...” +approaches of the hysteria—“I understood, a drawer...” -But here I took pity on my visitor's suspense, and some perhaps +But here I took pity on my visitor’s suspense, and some perhaps on my own growing curiosity. “There it is, sir,” said I, pointing to the drawer, where it lay on the @@ -25,7 +25,7 @@ heart: I could hear his teeth grate with the convulsive action of his jaws; and his face was so ghastly to see that I grew alarmed both for his life and reason. -“Compose yourself,” said I. +“Compose yourself,’ said I. He turned a dreadful smile to me, and as if with the decision of despair, plucked away the sheet. At sight of the contents, he uttered diff --git a/package.json b/package.json index 7f566c5..7e35412 100644 --- a/package.json +++ b/package.json @@ -46,7 +46,6 @@ "release": "bumpp --commit --tag --push" }, "dependencies": { - "tesseract.js": "^6.0.0", "unpdf": "^0.12.1" }, "devDependencies": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 51edc59..9760802 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -8,9 +8,6 @@ importers: .: dependencies: - tesseract.js: - specifier: ^6.0.0 - version: 6.0.0 unpdf: specifier: ^0.12.1 version: 0.12.1 @@ -832,9 +829,6 @@ packages: balanced-match@1.0.2: resolution: {integrity: sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==} - bmp-js@0.1.0: - resolution: {integrity: sha512-vHdS19CnY3hwiNdkaqk93DvjVLfbEcI8mys4UjuWrlX1haDmroo8o4xCzh4wD6DGV6HxRCyauwhHRqMTfERtjw==} - boolbase@1.0.0: resolution: {integrity: sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww==} @@ -1509,9 +1503,6 @@ packages: resolution: {integrity: sha512-AXcZb6vzzrFAUE61HnN4mpLqd/cSIwNQjtNWR0euPm6y0iqx3G4gOXaIDdtdDwZmhwe82LA6+zinmW4UBWVePQ==} engines: {node: '>=16.17.0'} - idb-keyval@6.2.1: - resolution: {integrity: sha512-8Sb3veuYCyrZL+VBt9LJfZjLUPWVvqn8tG28VqYNFCo43KHcKuq+b4EiXGeuaLAQWL2YmyDgMp2aSpH9JHsEQg==} - ignore@5.3.2: resolution: {integrity: sha512-hsBTNUqQTDwkWtcdYI2i06Y/nUBEsNEDJKjWdigLvegy8kDuJAS8uRlpkkcQpyEXL0Z/pjDy5HBmMjRCJ2gq+g==} engines: {node: '>= 4'} @@ -1572,9 +1563,6 @@ packages: resolution: {integrity: sha512-LnQR4bZ9IADDRSkvpqMGvt/tEJWclzklNgSw48V5EAaAeDd6qGvN8ei6k5p0tvxSR171VmGyHuTiAOfxAbr8kA==} engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} - is-url@1.2.4: - resolution: {integrity: sha512-ITvGim8FhRiYe4IQ5uHSkj7pVaPDrCTkNd3yq3cV7iZAcJdHTUMPMEHcqSOy9xZ9qFenQCvi+2wjH9a1nXqHww==} - isexe@2.0.0: resolution: {integrity: sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==} @@ -1998,10 +1986,6 @@ packages: resolution: {integrity: sha512-1FlR+gjXK7X+AsAHso35MnyN5KqGwJRi/31ft6x0M194ht7S+rWAvd7PHss9xSKMzE0asv1pyIHaJYq+BbacAQ==} engines: {node: '>=12'} - opencollective-postinstall@2.0.3: - resolution: {integrity: sha512-8AV/sCtuzUeTo8gQK5qDZzARrulB3egtLzFgteqB2tcT4Mw7B8Kt7JcDHmltjz6FOAHsvTevk70gZEbhM4ZS9Q==} - hasBin: true - optionator@0.9.4: resolution: {integrity: sha512-6IpQ7mKUxRcZNLIObR0hz7lxsapSSIYNZJwXPGeF0mTVqGKFIXj1DQcMoT22S3ROcLyY/rz0PWaWZ9ayWmad9g==} engines: {node: '>= 0.8.0'} @@ -2327,9 +2311,6 @@ packages: resolution: {integrity: sha512-J8rn6v4DBb2nnFqkqwy6/NnTYMcgLA+sLr0iIO41qpv0n+ngb7ksag2tMRl0inb1bbO/esUwzW1vbJi7K0sI0g==} engines: {node: ^12.0.0 || ^14.0.0 || >=16.0.0} - regenerator-runtime@0.13.11: - resolution: {integrity: sha512-kY1AZVr2Ra+t+piVaJ4gxaFaReZVH40AKNo7UCX6W+dEwBo/2oZJzqfuN1qLq1oL45o56cPaTXELwrTh8Fpggg==} - regexp-ast-analysis@0.7.1: resolution: {integrity: sha512-sZuz1dYW/ZsfG17WSAG7eS85r5a0dDsvg+7BiiYR5o6lKCAtUrEwdmRmaGF6rwVj3LcmAeYkOWKEPlbPzN3Y3A==} engines: {node: ^12.0.0 || ^14.0.0 || >=16.0.0} @@ -2532,12 +2513,6 @@ packages: resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==} engines: {node: '>=10'} - tesseract.js-core@6.0.0: - resolution: {integrity: sha512-1Qncm/9oKM7xgrQXZXNB+NRh19qiXGhxlrR8EwFbK5SaUbPZnS5OMtP/ghtqfd23hsr1ZvZbZjeuAGcMxd/ooA==} - - tesseract.js@6.0.0: - resolution: {integrity: sha512-tqYCod1HwJzkeZw1l6XWx+ly2hhisGcBtak9MArhYwDAxL0NgeVhLJcUjqPxZMQtpgtVUzWcpZPryi+hnaQGVw==} - test-exclude@7.0.1: resolution: {integrity: sha512-pFYqmTw68LXVjeWJMST4+borgQP2AyMNbg1BpZh9LbyhUeNkeaPF9gzfPGUAnSMV3qPYdWUwDIjjCLiSDOl7vg==} engines: {node: '>=18'} @@ -2730,9 +2705,6 @@ packages: peerDependencies: eslint: '>=6.0.0' - wasm-feature-detect@1.8.0: - resolution: {integrity: sha512-zksaLKM2fVlnB5jQQDqKXXwYHLQUVH9es+5TOOHwGOVJOCeRBCiPjwSg+3tN2AdTCzjgli4jijCH290kXb/zWQ==} - webidl-conversions@3.0.1: resolution: {integrity: sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==} @@ -2802,9 +2774,6 @@ packages: resolution: {integrity: sha512-rVksvsnNCdJ/ohGc6xgPwyN8eheCxsiLM8mxuE/t/mOVqJewPuO1miLpTHQiRgTKCLexL4MeAFVagts7HmNZ2Q==} engines: {node: '>=10'} - zlibjs@0.3.1: - resolution: {integrity: sha512-+J9RrgTKOmlxFSDHo0pI1xM6BLVUv+o0ZT9ANtCxGkjIVCCUdx9alUF8Gm+dGLKbkkkidWIHFDZHDMpfITt4+w==} - zwitch@2.0.4: resolution: {integrity: sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A==} @@ -3590,8 +3559,6 @@ snapshots: balanced-match@1.0.2: {} - bmp-js@0.1.0: {} - boolbase@1.0.0: {} brace-expansion@1.1.11: @@ -4418,8 +4385,6 @@ snapshots: human-signals@5.0.0: {} - idb-keyval@6.2.1: {} - ignore@5.3.2: {} import-fresh@3.3.0: @@ -4468,8 +4433,6 @@ snapshots: is-stream@3.0.0: {} - is-url@1.2.4: {} - isexe@2.0.0: {} istanbul-lib-coverage@3.2.2: {} @@ -4982,6 +4945,7 @@ snapshots: node-fetch@2.7.0: dependencies: whatwg-url: 5.0.0 + optional: true node-releases@2.0.19: {} @@ -5038,8 +5002,6 @@ snapshots: dependencies: mimic-fn: 4.0.0 - opencollective-postinstall@2.0.3: {} - optionator@0.9.4: dependencies: deep-is: 0.1.4 @@ -5343,8 +5305,6 @@ snapshots: dependencies: '@eslint-community/regexpp': 4.12.1 - regenerator-runtime@0.13.11: {} - regexp-ast-analysis@0.7.1: dependencies: '@eslint-community/regexpp': 4.12.1 @@ -5561,22 +5521,6 @@ snapshots: mkdirp: 1.0.4 yallist: 4.0.0 - tesseract.js-core@6.0.0: {} - - tesseract.js@6.0.0: - dependencies: - bmp-js: 0.1.0 - idb-keyval: 6.2.1 - is-url: 1.2.4 - node-fetch: 2.7.0 - opencollective-postinstall: 2.0.3 - regenerator-runtime: 0.13.11 - tesseract.js-core: 6.0.0 - wasm-feature-detect: 1.8.0 - zlibjs: 0.3.1 - transitivePeerDependencies: - - encoding - test-exclude@7.0.1: dependencies: '@istanbuljs/schema': 0.1.3 @@ -5606,7 +5550,8 @@ snapshots: dependencies: eslint-visitor-keys: 3.4.3 - tr46@0.0.3: {} + tr46@0.0.3: + optional: true ts-api-utils@2.0.0(typescript@5.7.3): dependencies: @@ -5802,14 +5747,14 @@ snapshots: transitivePeerDependencies: - supports-color - wasm-feature-detect@1.8.0: {} - - webidl-conversions@3.0.1: {} + webidl-conversions@3.0.1: + optional: true whatwg-url@5.0.0: dependencies: tr46: 0.0.3 webidl-conversions: 3.0.1 + optional: true which@2.0.2: dependencies: @@ -5872,6 +5817,4 @@ snapshots: yocto-queue@0.1.0: {} - zlibjs@0.3.1: {} - zwitch@2.0.4: {} diff --git a/src/extractors/img.extractor.ts b/src/extractors/img.extractor.ts index 0638b18..6c8547a 100644 --- a/src/extractors/img.extractor.ts +++ b/src/extractors/img.extractor.ts @@ -1,5 +1,6 @@ import { Buffer } from 'node:buffer'; -import { createWorker } from 'tesseract.js'; +import { exec } from 'node:child_process'; +import { env } from 'node:process'; import { defineTextExtractor } from '../extractors.models'; export const imageExtractorDefinition = defineTextExtractor({ @@ -11,13 +12,23 @@ export const imageExtractorDefinition = defineTextExtractor({ 'image/gif', ], extract: async ({ arrayBuffer }) => { - const buffer = Buffer.from(arrayBuffer); + const binary = env.LECTURE_TESSERACT_BINARY ?? 'tesseract'; - const worker = await createWorker(); + const { stdout } = await new Promise<{ stdout: string }>((resolve, reject) => { + const child = exec(`${binary} stdin stdout`, (error, stdout) => { + if (error) { + reject(error); + } else { + resolve({ stdout }); + } + }); - const { data: { text } } = await worker.recognize(buffer); - await worker.terminate(); + child.stdin.write(Buffer.from(arrayBuffer)); + child.stdin.end(); + }); - return { content: text }; + return { + content: stdout, + }; }, });