From 89ac389cb2a804c5a8e5f91a61f304916cc8d737 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Wed, 2 Dec 2020 17:29:40 +0100 Subject: [PATCH 01/51] Introduce solr highlighting --- Classes/Plugin/Eid/SearchInDocument.php | 2 ++ .../ApacheSolr/configsets/dlf/conf/schema.xml | 13 +++++++++++-- .../configsets/dlf/conf/solrconfig.xml | 17 +++++++++++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/Classes/Plugin/Eid/SearchInDocument.php b/Classes/Plugin/Eid/SearchInDocument.php index 85e7f0425..bd01cc421 100644 --- a/Classes/Plugin/Eid/SearchInDocument.php +++ b/Classes/Plugin/Eid/SearchInDocument.php @@ -63,7 +63,9 @@ public function main(ServerRequestInterface $request) $hl = $query->getHighlighting(); $hl->setFields([$fields['fulltext']]); $hl->setUseFastVectorHighlighter(true); + var_dump($query); $results = $solr->service->select($query); + var_dump($results); $output['numFound'] = $results->getNumFound(); $highlighting = $results->getHighlighting(); foreach ($results as $result) { diff --git a/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml b/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml index 517439d40..103a69995 100644 --- a/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml +++ b/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml @@ -39,8 +39,14 @@ limitations under the License. - + + + + + + + @@ -55,6 +61,9 @@ limitations under the License. + + + @@ -127,7 +136,7 @@ limitations under the License. - + diff --git a/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml b/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml index 3e2944740..b3b551cad 100644 --- a/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml +++ b/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml @@ -83,6 +83,8 @@ + + + + + + query + ocrHighlight + highlight + + + + + + From 9497f6a7fce96d39bfb5ad80f00f76164376d282 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Thu, 8 Apr 2021 16:10:43 +0200 Subject: [PATCH 19/51] Check if passed id is number or numeric string Previous check was converting numeric string to int if it was starting with number --- Classes/Plugin/Eid/SearchInDocument.php | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Classes/Plugin/Eid/SearchInDocument.php b/Classes/Plugin/Eid/SearchInDocument.php index aaccb1b02..1c1be4386 100644 --- a/Classes/Plugin/Eid/SearchInDocument.php +++ b/Classes/Plugin/Eid/SearchInDocument.php @@ -107,6 +107,6 @@ private function getQuery($fields, $parameters) { } private function getUid($uid) { - return intval($uid) > 0 ? intval($uid) : $uid; + return is_numeric($uid) > 0 ? intval($uid) : $uid; } } From 7a156953b77f2a6648d55f141699e677b45457d0 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Wed, 28 Apr 2021 10:03:09 +0200 Subject: [PATCH 20/51] Remove indexing of full text from file Source pointer can't be on any other server han SOLR, so it is not usable here. --- Configuration/ApacheSolr/configsets/dlf/conf/schema.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml b/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml index 103a69995..056a0c59c 100644 --- a/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml +++ b/Configuration/ApacheSolr/configsets/dlf/conf/schema.xml @@ -42,8 +42,6 @@ limitations under the License. - - From d838dc0784424b4f6c58d5b91159cab1dc72f9d8 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Wed, 28 Apr 2021 11:31:55 +0200 Subject: [PATCH 21/51] Save to index full OCR + add working logging for document and indexing --- Classes/Common/Document.php | 81 ++++++++++++++++++++++++++++++++- Classes/Common/Indexer.php | 21 +++++---- Classes/Common/MetsDocument.php | 16 +++++++ 3 files changed, 109 insertions(+), 9 deletions(-) diff --git a/Classes/Common/Document.php b/Classes/Common/Document.php index e36f4ea6d..d0d4169d2 100644 --- a/Classes/Common/Document.php +++ b/Classes/Common/Document.php @@ -570,8 +570,9 @@ public static function &getInstance($uid, $pid = 0, $forceReload = false) if (!empty($extConf['caching'])) { Helper::saveToSession(self::$registry, get_class($instance)); } + $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__); } - $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance)); + // Return new instance. return $instance; } @@ -637,6 +638,82 @@ public function getPhysicalPage($logicalPage) return 1; } + /** + * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas. Text might be + * given as ALTO for METS or as annotations or ALTO for IIIF resources. + * + * @access public + * + * @abstract + * + * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property + * of the Manifest / Range (IIIF) + * + * @return string The OCR full text + */ + public abstract function getFullText($id); + + /** + * This extracts the OCR full text for a physical structure node / IIIF Manifest / Canvas from an + * XML full text representation (currently only ALTO). For IIIF manifests, ALTO documents have + * to be given in the Canvas' / Manifest's "seeAlso" property. + * + * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property + * of the Manifest / Range (IIIF) + * + * @return string The OCR full text + */ + protected function getFullTextFromXml($id) + { + $fullText = ''; + // Load available text formats, ... + $this->loadFormats(); + // ... physical structure ... + $this->_getPhysicalStructure(); + // ... and extension configuration. + $extConf = unserialize($GLOBALS['TYPO3_CONF_VARS']['EXT']['extConf'][self::$extKey]); + $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']); + if (!empty($this->physicalStructureInfo[$id])) { + while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { + if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { + // Get full text file. + $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); + if ($file !== false) { + // Turn off libxml's error logging. + $libxmlErrors = libxml_use_internal_errors(true); + // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. + $previousValueOfEntityLoader = libxml_disable_entity_loader(true); + // Load XML from file. + $rawTextXml = simplexml_load_string($file); + // Reset entity loader setting. + libxml_disable_entity_loader($previousValueOfEntityLoader); + // Reset libxml's error logging. + libxml_use_internal_errors($libxmlErrors); + // Get the root element's name as text format. + $textFormat = strtoupper($rawTextXml->getName()); + } else { + $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"'); + return $fullText; + } + break; + } + } + } else { + $this->logger->warning('Invalid structure node @ID "' . $id . '"'); + return $fullText; + } + // Is this text format supported? + // This part actually differs from previous version of indexed OCR + if (!empty($file) && !empty($this->formats[$textFormat])) { + if (!empty($this->formats[$textFormat]['class'])) { + $fullText = $file; + } + } else { + $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); + } + return $fullText; + } + /** * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas. Text might be * given as ALTO for METS or as annotations or ALTO for IIIF resources. If IIIF plain text annotations @@ -652,6 +729,7 @@ public function getPhysicalPage($logicalPage) * * @return string The physical structure node's / IIIF resource's raw text */ + //TODO: check if this method is still needed somewhere, if not simply replace with getFullText public abstract function getRawText($id); /** @@ -1301,6 +1379,7 @@ public function save($pid = 0, $core = 0) } // Add document to index. if ($core) { + //TODO: handling if this method returns failure Indexer::add($this, $core); } else { $this->logger->notice('Invalid UID "' . $core . '" for Solr core'); diff --git a/Classes/Common/Indexer.php b/Classes/Common/Indexer.php index 32f1fa26a..6d4f1feaa 100644 --- a/Classes/Common/Indexer.php +++ b/Classes/Common/Indexer.php @@ -116,15 +116,16 @@ public static function add(Document &$doc, $core = 0) $updateQuery = self::$solr->service->createUpdate(); $updateQuery->addDeleteQuery('uid:' . $doc->uid); self::$solr->service->update($updateQuery); + //TODO: handle problem with indexing documents without OCR // Index every logical unit as separate Solr document. - foreach ($doc->tableOfContents as $logicalUnit) { + /*foreach ($doc->tableOfContents as $logicalUnit) { if (!$errors) { $errors = self::processLogical($doc, $logicalUnit); } else { break; } - } - // Index fulltext files if available. + }*/ + // Index full text files if available. if ($doc->hasFulltext) { foreach ($doc->physicalStructure as $pageNumber => $xmlId) { if (!$errors) { @@ -315,6 +316,8 @@ protected static function loadIndexConf($pid) */ protected static function processLogical(Document &$doc, array $logicalUnit) { + $logger = GeneralUtility::makeInstance('TYPO3\CMS\Core\Log\LogManager')->getLogger(__CLASS__); + $errors = 0; // Get metadata for logical unit. $metadata = $doc->metadataArray[$logicalUnit['id']]; @@ -406,6 +409,7 @@ protected static function processLogical(Document &$doc, array $logicalUnit) 'core.template.flashMessages' ); } + $logger->error('Apache Solr threw exception: "' . $e->getMessage() . '"'); return 1; } } @@ -436,10 +440,10 @@ protected static function processLogical(Document &$doc, array $logicalUnit) */ protected static function processPhysical(Document &$doc, $page, array $physicalUnit) { - if ( - $doc->hasFulltext - && $fulltext = $doc->getRawText($physicalUnit['id']) - ) { + $logger = GeneralUtility::makeInstance('TYPO3\CMS\Core\Log\LogManager')->getLogger(__CLASS__); + + $logger->error($doc->hasFulltext && $fullText = $doc->getFullText($physicalUnit['id'])); + if ($doc->hasFulltext && $fullText = $doc->getFullText($physicalUnit['id'])) { // Read extension configuration. $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey); // Create new Solr document. @@ -463,7 +467,7 @@ protected static function processPhysical(Document &$doc, $page, array $physical $solrDoc->setField('toplevel', false); $solrDoc->setField('type', $physicalUnit['type'], self::$fields['fieldboost']['type']); $solrDoc->setField('collection', $doc->metadataArray[$doc->toplevelId]['collection']); - $solrDoc->setField('fulltext', htmlspecialchars($fulltext)); + $solrDoc->setField('fulltext', $fullText); // Add faceting information to physical sub-elements if applicable. foreach ($doc->metadataArray[$doc->toplevelId] as $index_name => $data) { if ( @@ -503,6 +507,7 @@ protected static function processPhysical(Document &$doc, $page, array $physical true, 'core.template.flashMessages' ); + $logger->error('Apache Solr threw exception: "' . $e->getMessage() . '"'); } return 1; } diff --git a/Classes/Common/MetsDocument.php b/Classes/Common/MetsDocument.php index ab8045e21..4bbbb4561 100644 --- a/Classes/Common/MetsDocument.php +++ b/Classes/Common/MetsDocument.php @@ -666,6 +666,22 @@ public function getRawText($id) return $rawText; } + /** + * {@inheritDoc} + * @see \Kitodo\Dlf\Common\Document::getFullText() + */ + public function getFullText($id) + { + $fullText = ''; + + // Load fileGrps and check for full text files. + $this->_getFileGrps(); + if ($this->hasFulltext) { + $fullText = $this->getFullTextFromXml($id); + } + return $fullText; + } + /** * {@inheritDoc} * @see Document::getStructureDepth() From 2af9f707b7a8285f15f5f91e73e010b7ff0517c4 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Wed, 28 Apr 2021 16:53:58 +0200 Subject: [PATCH 22/51] Remove getRawText methods --- Classes/Common/Document.php | 124 ++++----------------------- Classes/Common/FulltextInterface.php | 1 + Classes/Common/IiifManifest.php | 7 +- Classes/Common/MetsDocument.php | 19 ---- 4 files changed, 23 insertions(+), 128 deletions(-) diff --git a/Classes/Common/Document.php b/Classes/Common/Document.php index d0d4169d2..3e18127d6 100644 --- a/Classes/Common/Document.php +++ b/Classes/Common/Document.php @@ -677,20 +677,9 @@ protected function getFullTextFromXml($id) while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { // Get full text file. - $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); - if ($file !== false) { - // Turn off libxml's error logging. - $libxmlErrors = libxml_use_internal_errors(true); - // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. - $previousValueOfEntityLoader = libxml_disable_entity_loader(true); - // Load XML from file. - $rawTextXml = simplexml_load_string($file); - // Reset entity loader setting. - libxml_disable_entity_loader($previousValueOfEntityLoader); - // Reset libxml's error logging. - libxml_use_internal_errors($libxmlErrors); - // Get the root element's name as text format. - $textFormat = strtoupper($rawTextXml->getName()); + $fileContent = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); + if ($fileContent !== false) { + $textFormat = $this->getTextFormat($fileContent); } else { $this->logger->warning('Couldn\'t load full text file for structure node @ID "' . $id . '"'); return $fullText; @@ -704,105 +693,28 @@ protected function getFullTextFromXml($id) } // Is this text format supported? // This part actually differs from previous version of indexed OCR - if (!empty($file) && !empty($this->formats[$textFormat])) { - if (!empty($this->formats[$textFormat]['class'])) { - $fullText = $file; - } + if (!empty($fileContent) && !empty($this->formats[$textFormat])) { + $fullText = $fileContent; } else { $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); } return $fullText; } - /** - * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas. Text might be - * given as ALTO for METS or as annotations or ALTO for IIIF resources. If IIIF plain text annotations - * with the motivation "painting" should be treated as full text representations, the extension has to be - * configured accordingly. - * - * @access public - * - * @abstract - * - * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property - * of the Manifest / Range (IIIF) - * - * @return string The physical structure node's / IIIF resource's raw text - */ - //TODO: check if this method is still needed somewhere, if not simply replace with getFullText - public abstract function getRawText($id); - - /** - * This extracts the raw text for a physical structure node / IIIF Manifest / Canvas from an - * XML fulltext representation (currently only ALTO). For IIIF manifests, ALTO documents have - * to be given in the Canvas' / Manifest's "seeAlso" property. - * - * @param string $id: The @ID attribute of the physical structure node (METS) or the @id property - * of the Manifest / Range (IIIF) - * - * @return string The physical structure node's / IIIF resource's raw text from XML - */ - protected function getRawTextFromXml($id) + private function getTextFormat($fileContent) { - $rawText = ''; - // Load available text formats, ... - $this->loadFormats(); - // ... physical structure ... - $this->_getPhysicalStructure(); - // ... and extension configuration. - $extConf = GeneralUtility::makeInstance(ExtensionConfiguration::class)->get(self::$extKey); - $fileGrpsFulltext = GeneralUtility::trimExplode(',', $extConf['fileGrpFulltext']); - if (!empty($this->physicalStructureInfo[$id])) { - while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { - if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - // Get fulltext file. - $file = GeneralUtility::getUrl($this->getFileLocation($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])); - if ($file !== false) { - // Turn off libxml's error logging. - $libxmlErrors = libxml_use_internal_errors(true); - // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. - $previousValueOfEntityLoader = libxml_disable_entity_loader(true); - // Load XML from file. - $rawTextXml = simplexml_load_string($file); - // Reset entity loader setting. - libxml_disable_entity_loader($previousValueOfEntityLoader); - // Reset libxml's error logging. - libxml_use_internal_errors($libxmlErrors); - // Get the root element's name as text format. - $textFormat = strtoupper($rawTextXml->getName()); - } else { - $this->logger->warning('Couldn\'t load fulltext file for structure node @ID "' . $id . '"'); - return $rawText; - } - break; - } - } - } else { - $this->logger->warning('Invalid structure node @ID "' . $id . '"'); - return $rawText; - } - // Is this text format supported? - if ( - !empty($rawTextXml) - && !empty($this->formats[$textFormat]) - ) { - if (!empty($this->formats[$textFormat]['class'])) { - $class = $this->formats[$textFormat]['class']; - // Get the raw text from class. - if ( - class_exists($class) - && ($obj = GeneralUtility::makeInstance($class)) instanceof FulltextInterface - ) { - $rawText = $obj->getRawText($rawTextXml); - $this->rawTextArray[$id] = $rawText; - } else { - $this->logger->warning('Invalid class/method "' . $class . '->getRawText()" for text format "' . $textFormat . '"'); - } - } - } else { - $this->logger->warning('Unsupported text format "' . $textFormat . '" in physical node with @ID "' . $id . '"'); - } - return $rawText; + // Turn off libxml's error logging. + $libxmlErrors = libxml_use_internal_errors(true); + // Disables the functionality to allow external entities to be loaded when parsing the XML, must be kept. + $previousValueOfEntityLoader = libxml_disable_entity_loader(true); + // Load XML from file. + $rawTextXml = simplexml_load_string($fileContent); + // Reset entity loader setting. + libxml_disable_entity_loader($previousValueOfEntityLoader); + // Reset libxml's error logging. + libxml_use_internal_errors($libxmlErrors); + // Get the root element's name as text format. + return strtoupper($rawTextXml->getName()); } /** diff --git a/Classes/Common/FulltextInterface.php b/Classes/Common/FulltextInterface.php index 68d10f17a..755b7ae53 100644 --- a/Classes/Common/FulltextInterface.php +++ b/Classes/Common/FulltextInterface.php @@ -21,6 +21,7 @@ * @access public * @abstract */ +//TODO: check if this is still needed when actually full text xml is indexed interface FulltextInterface { /** diff --git a/Classes/Common/IiifManifest.php b/Classes/Common/IiifManifest.php index 6153e7b6a..1764ad07a 100644 --- a/Classes/Common/IiifManifest.php +++ b/Classes/Common/IiifManifest.php @@ -786,9 +786,10 @@ protected function getParentDocumentUidForSaving($pid, $core) /** * {@inheritDoc} - * @see Document::getRawText() + * @see Document::getFullText() */ - public function getRawText($id) + //TODO: rewrite it to get full OCR + public function getFullText($id) { $rawText = ''; // Get text from raw text array if available. @@ -805,7 +806,7 @@ public function getRawText($id) if (!empty($this->physicalStructureInfo[$id])) { while ($fileGrpFulltext = array_shift($fileGrpsFulltext)) { if (!empty($this->physicalStructureInfo[$id]['files'][$fileGrpFulltext])) { - $rawText = parent::getRawTextFromXml($id); + $rawText = parent::getFullTextFromXml($id); break; } } diff --git a/Classes/Common/MetsDocument.php b/Classes/Common/MetsDocument.php index 4bbbb4561..d0c22ce3b 100644 --- a/Classes/Common/MetsDocument.php +++ b/Classes/Common/MetsDocument.php @@ -647,25 +647,6 @@ class_exists($class) } } - /** - * {@inheritDoc} - * @see \Kitodo\Dlf\Common\Document::getRawText() - */ - public function getRawText($id) - { - $rawText = ''; - // Get text from raw text array if available. - if (!empty($this->rawTextArray[$id])) { - return $this->rawTextArray[$id]; - } - // Load fileGrps and check for fulltext files. - $this->_getFileGrps(); - if ($this->hasFulltext) { - $rawText = $this->getRawTextFromXml($id); - } - return $rawText; - } - /** * {@inheritDoc} * @see \Kitodo\Dlf\Common\Document::getFullText() From 864e802197173d049644cc53d4303ed3e1e37cdd Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Thu, 29 Apr 2021 10:41:21 +0200 Subject: [PATCH 23/51] Return false from save method if documents were not indexed Either because of incorrect solr core or because some errors while indexing --- Classes/Common/Document.php | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/Classes/Common/Document.php b/Classes/Common/Document.php index 3e18127d6..5b2796476 100644 --- a/Classes/Common/Document.php +++ b/Classes/Common/Document.php @@ -1291,10 +1291,14 @@ public function save($pid = 0, $core = 0) } // Add document to index. if ($core) { - //TODO: handling if this method returns failure - Indexer::add($this, $core); + //TODO: change return of this method to true on success and false on failure + $hasErrors = Indexer::add($this, $core); + if ($hasErrors) { + return false; + } } else { $this->logger->notice('Invalid UID "' . $core . '" for Solr core'); + return false; } return true; } From f6c7ae5d7d9855d9c852d706927f72c8e18eef05 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 17 May 2021 14:43:48 +0200 Subject: [PATCH 24/51] Fix after rebase class for logger in Document instance --- Classes/Common/Document.php | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Classes/Common/Document.php b/Classes/Common/Document.php index 5b2796476..2fd25d0b5 100644 --- a/Classes/Common/Document.php +++ b/Classes/Common/Document.php @@ -570,9 +570,8 @@ public static function &getInstance($uid, $pid = 0, $forceReload = false) if (!empty($extConf['caching'])) { Helper::saveToSession(self::$registry, get_class($instance)); } - $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(__CLASS__); + $instance->logger = GeneralUtility::makeInstance(LogManager::class)->getLogger(get_class($instance)); } - // Return new instance. return $instance; } From b3a35bf446322a4273f41e5dba36627fa17c22c2 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 17 May 2021 18:24:06 +0200 Subject: [PATCH 25/51] Highlight more than one word in search --- Resources/Public/Javascript/PageView/PageView.js | 2 +- Resources/Public/Javascript/PageView/SearchInDocument.js | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/Resources/Public/Javascript/PageView/PageView.js b/Resources/Public/Javascript/PageView/PageView.js index 3457ff041..181ecff76 100644 --- a/Resources/Public/Javascript/PageView/PageView.js +++ b/Resources/Public/Javascript/PageView/PageView.js @@ -350,7 +350,7 @@ dlfViewer.prototype.displayHighlightWord = function() { if (hasOwnProperty && this.fulltexts[0] !== undefined && this.fulltexts[0].url !== '' && this.images.length > 0) { var value = urlParams[param], - values = value.split(';'), + values = decodeURIComponent(value).split(' '), fulltextData = dlfFullTextUtils.fetchFullTextDataFromServer(this.fulltexts[0].url, this.images[0]), fulltextDataImageTwo = undefined; diff --git a/Resources/Public/Javascript/PageView/SearchInDocument.js b/Resources/Public/Javascript/PageView/SearchInDocument.js index 68494a339..9ccfe69d1 100644 --- a/Resources/Public/Javascript/PageView/SearchInDocument.js +++ b/Resources/Public/Javascript/PageView/SearchInDocument.js @@ -131,9 +131,6 @@ function getAllQueryParams(baseUrl, queryParams) { * @returns {array} array with params in form 'param' => 'value' */ function getNeededQueryParams(element) { - var searchWord = element['snippet']; - searchWord = searchWord.substring(searchWord.indexOf('') + 4, searchWord.indexOf('')); - var id = $("input[id='tx-dlf-search-in-document-id']").attr('name'); var highlightWord = $("input[id='tx-dlf-search-in-document-highlight-word']").attr('name'); var page = $("input[id='tx-dlf-search-in-document-page']").attr('name'); @@ -145,7 +142,7 @@ function getNeededQueryParams(element) { queryParams[id] = element['uid']; } queryParams.push(highlightWord); - queryParams[highlightWord] = encodeURIComponent(searchWord); + queryParams[highlightWord] = encodeURIComponent($("input[id='tx-dlf-search-in-document-query']").val()); queryParams.push(page); queryParams[page] = element['page']; From 50e95c3e6a8c032f52173ef37cfa30479aba939f Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 17 May 2021 18:31:42 +0200 Subject: [PATCH 26/51] Preserve search phrase and get hit list after hit link was clicked --- .../Javascript/PageView/SearchInDocument.js | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Resources/Public/Javascript/PageView/SearchInDocument.js b/Resources/Public/Javascript/PageView/SearchInDocument.js index 9ccfe69d1..3b85e678c 100644 --- a/Resources/Public/Javascript/PageView/SearchInDocument.js +++ b/Resources/Public/Javascript/PageView/SearchInDocument.js @@ -189,6 +189,20 @@ function getNavigationButtons(start, numFound) { return buttons; } +function triggerSearchAfterHitLoad() { + var queryParams = getCurrentQueryParams(getBaseUrl(" ")); + + for(var i = 0; i < queryParams.length; i++) { + var queryParam = queryParams[i].split('='); + + if(queryParam[0].indexOf($("input[id='tx-dlf-search-in-document-highlight-word']").attr('name')) !== -1) { + $("input[id='tx-dlf-search-in-document-query']").val(decodeURIComponent(queryParam[1])); + $("#tx-dlf-search-in-document-form").submit(); + break; + } + } +} + $(document).ready(function() { $("#tx-dlf-search-in-document-form").submit(function(event) { // Stop form from submitting normally @@ -255,4 +269,6 @@ $(document).ready(function() { $('.results-active-indicator').remove(); $('#tx-dlf-search-in-document-query').val(''); }); + + triggerSearchAfterHitLoad(); }); From 16e27eb759ad1a233582c3af7401beee8de48f82 Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Tue, 18 May 2021 13:14:27 +0200 Subject: [PATCH 27/51] Index documents with empty fulltext field --- Classes/Common/Indexer.php | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Classes/Common/Indexer.php b/Classes/Common/Indexer.php index 6d4f1feaa..7a31028e8 100644 --- a/Classes/Common/Indexer.php +++ b/Classes/Common/Indexer.php @@ -116,15 +116,15 @@ public static function add(Document &$doc, $core = 0) $updateQuery = self::$solr->service->createUpdate(); $updateQuery->addDeleteQuery('uid:' . $doc->uid); self::$solr->service->update($updateQuery); - //TODO: handle problem with indexing documents without OCR + // Index every logical unit as separate Solr document. - /*foreach ($doc->tableOfContents as $logicalUnit) { + foreach ($doc->tableOfContents as $logicalUnit) { if (!$errors) { $errors = self::processLogical($doc, $logicalUnit); } else { break; } - }*/ + } // Index full text files if available. if ($doc->hasFulltext) { foreach ($doc->physicalStructure as $pageNumber => $xmlId) { @@ -360,6 +360,7 @@ protected static function processLogical(Document &$doc, array $logicalUnit) $solrDoc->setField('terms', $metadata['terms']); $solrDoc->setField('restrictions', $metadata['restrictions']); $solrDoc->setField('collection', $doc->metadataArray[$doc->toplevelId]['collection']); + $solrDoc->setField('fulltext', ''); $coordinates = json_decode($metadata['coordinates'][0]); if (is_object($coordinates)) { $solrDoc->setField('geom', json_encode($coordinates->features[0])); From 431f9805ce2b22c85f8e1cfbc709334c26bb592c Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 28 Jun 2021 15:57:47 +0200 Subject: [PATCH 28/51] Use coordinates inside the function for searching full text features --- .../Public/Javascript/PageView/PageView.js | 26 ++++++-- .../Javascript/PageView/SearchInDocument.js | 64 +++++++++++++++++++ .../Public/Javascript/PageView/Utility.js | 10 +-- 3 files changed, 91 insertions(+), 9 deletions(-) diff --git a/Resources/Public/Javascript/PageView/PageView.js b/Resources/Public/Javascript/PageView/PageView.js index 181ecff76..05636ae40 100644 --- a/Resources/Public/Javascript/PageView/PageView.js +++ b/Resources/Public/Javascript/PageView/PageView.js @@ -80,7 +80,13 @@ var dlfViewer = function(settings){ * @type {string} * @private */ - this.highlightKeys = 'tx_dlf[highlight_word]'; + this.highlightKeys = 'hl'; + + /** + * @type {string|undefined} + * @private + */ + this.highlightWords = null; /** * @type {Object|undefined} @@ -292,7 +298,10 @@ dlfViewer.prototype.createControls_ = function(controlNames, layers) { /** * Displays highlight words */ -dlfViewer.prototype.displayHighlightWord = function() { +dlfViewer.prototype.displayHighlightWord = function(highlightWords = null) { + if(highlightWords != null) { + this.highlightWords = highlightWords; + } if (!dlfUtils.exists(this.highlightLayer)) { @@ -349,11 +358,18 @@ dlfViewer.prototype.displayHighlightWord = function() { } if (hasOwnProperty && this.fulltexts[0] !== undefined && this.fulltexts[0].url !== '' && this.images.length > 0) { - var value = urlParams[param], - values = decodeURIComponent(value).split(' '), + var value = undefined, fulltextData = dlfFullTextUtils.fetchFullTextDataFromServer(this.fulltexts[0].url, this.images[0]), fulltextDataImageTwo = undefined; + if(this.highlightWords != null) { + value = this.highlightWords; + } else { + value = urlParams[param]; + } + + var values = decodeURIComponent(value).split(';'), + // check if there is another image / fulltext to look for if (this.images.length === 2 & this.fulltexts[1] !== undefined && this.fulltexts[1].url !== '') { var image = $.extend({}, this.images[1]); @@ -364,7 +380,7 @@ dlfViewer.prototype.displayHighlightWord = function() { var stringFeatures = fulltextDataImageTwo === undefined ? fulltextData.getStringFeatures() : fulltextData.getStringFeatures().concat(fulltextDataImageTwo.getStringFeatures()); values.forEach($.proxy(function(value) { - var features = dlfUtils.searchFeatureCollectionForText(stringFeatures, value); + var features = dlfUtils.searchFeatureCollectionForCoordinates(stringFeatures, value); if (features !== undefined) { for (var i = 0; i < features.length; i++) { this.highlightLayer.getSource().addFeatures([features[i]]); diff --git a/Resources/Public/Javascript/PageView/SearchInDocument.js b/Resources/Public/Javascript/PageView/SearchInDocument.js index 3b85e678c..0d9f5296b 100644 --- a/Resources/Public/Javascript/PageView/SearchInDocument.js +++ b/Resources/Public/Javascript/PageView/SearchInDocument.js @@ -145,10 +145,28 @@ function getNeededQueryParams(element) { queryParams[highlightWord] = encodeURIComponent($("input[id='tx-dlf-search-in-document-query']").val()); queryParams.push(page); queryParams[page] = element['page']; + queryParams.push('hl'); + queryParams['hl'] = encodeURIComponent(getHighlights(element['highlight'])); return queryParams; } +function getHighlights(highlight) { + var highlights = ""; + + for(var i = 0; i < highlight.length; i++) { + if (highlights === "") { + highlights += highlight[i]; + } else { + if(highlights.indexOf(highlight[i]) === -1) { + highlights += ';' + highlight[i]; + } + } + } + + return highlights; +} + /** * Get snippet link. * @@ -189,6 +207,52 @@ function getNavigationButtons(start, numFound) { return buttons; } +function getCurrentPage() { + var page = 1; + var queryParams = getCurrentQueryParams(getBaseUrl(" ")); + + for(var i = 0; i < queryParams.length; i++) { + var queryParam = queryParams[i].split('='); + + if(queryParam[0] === $("input[id='tx-dlf-search-in-document-page']").attr('name')) { + page = queryParam[1]; + } + } + + return page; +} + +function addImageHighlightAfterFirstLoad(data) { + var queryParams = getCurrentQueryParams(getBaseUrl(" ")); + var hlParameterFound = false; + + for(var i = 0; i < queryParams.length; i++) { + var queryParam = queryParams[i].split('='); + + if(queryParam[0] === 'hl') { + hlParameterFound = true; + break; + } + } + + if(!hlParameterFound && data['numFound'] > 0) { + var page = getCurrentPage(); + + data['documents'].forEach(function (element, i) { + if(element['page'] == page) { + if (element['highlight'].length > 0) { + if(tx_dlf_viewer.map != null) { + tx_dlf_viewer.displayHighlightWord(encodeURIComponent(getHighlights(element['highlight']))); + } else { + setTimeout(addImageHighlightAfterFirstLoad, 500, data); + } + } + addHighlightEffect(element['highlight']) + } + }); + } +} + function triggerSearchAfterHitLoad() { var queryParams = getCurrentQueryParams(getBaseUrl(" ")); diff --git a/Resources/Public/Javascript/PageView/Utility.js b/Resources/Public/Javascript/PageView/Utility.js index d731be7cf..21d17c6b2 100644 --- a/Resources/Public/Javascript/PageView/Utility.js +++ b/Resources/Public/Javascript/PageView/Utility.js @@ -865,16 +865,18 @@ dlfUtils.scaleToImageSize = function (features, imageObj, width, height, opt_off }; /** - * Search a feature collcetion for a feature with the given text + * Search a feature collection for a feature with the given coordinates * @param {Array.} featureCollection - * @param {string} text + * @param {string} coordinates * @return {Array.|undefined} */ -dlfUtils.searchFeatureCollectionForText = function (featureCollection, text) { +dlfUtils.searchFeatureCollectionForCoordinates = function (featureCollection, coordinates) { var features = []; featureCollection.forEach(function (ft) { if (ft.get('fulltext') !== undefined) { - if (ft.get('fulltext').toLowerCase().indexOf(text.toLowerCase()) > -1) features.push(ft); + if ((ft.get('width') + '_' + ft.get('height') + '_' + ft.get('hpos') + '_' + ft.get('vpos')) == coordinates) { + features.push(ft); + } } }); return features.length > 0 ? features : undefined; From f65ed446ace385ab407e7fa54dc5335081421cca Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Mon, 28 Jun 2021 16:20:56 +0200 Subject: [PATCH 29/51] Fix Codacy errors --- .../Javascript/PageView/SearchInDocument.js | 36 +++++++++---------- .../Public/Javascript/PageView/Utility.js | 2 +- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/Resources/Public/Javascript/PageView/SearchInDocument.js b/Resources/Public/Javascript/PageView/SearchInDocument.js index 0d9f5296b..830eaf92b 100644 --- a/Resources/Public/Javascript/PageView/SearchInDocument.js +++ b/Resources/Public/Javascript/PageView/SearchInDocument.js @@ -83,6 +83,22 @@ function getBaseUrl(id) { return baseUrl; } +function getHighlights(highlight) { + var highlights = ""; + + for(var i = 0; i < highlight.length; i++) { + if (highlights === "") { + highlights += highlight[i]; + } else { + if(highlights.indexOf(highlight[i]) === -1) { + highlights += ';' + highlight[i]; + } + } + } + + return highlights; +} + /** * Get current URL query parameters. * It returns array of params in form 'param=value' if there are any params supplied in the given url. If there are none it returns empty array @@ -151,22 +167,6 @@ function getNeededQueryParams(element) { return queryParams; } -function getHighlights(highlight) { - var highlights = ""; - - for(var i = 0; i < highlight.length; i++) { - if (highlights === "") { - highlights += highlight[i]; - } else { - if(highlights.indexOf(highlight[i]) === -1) { - highlights += ';' + highlight[i]; - } - } - } - - return highlights; -} - /** * Get snippet link. * @@ -239,7 +239,7 @@ function addImageHighlightAfterFirstLoad(data) { var page = getCurrentPage(); data['documents'].forEach(function (element, i) { - if(element['page'] == page) { + if(element['page'] === page) { if (element['highlight'].length > 0) { if(tx_dlf_viewer.map != null) { tx_dlf_viewer.displayHighlightWord(encodeURIComponent(getHighlights(element['highlight']))); @@ -247,7 +247,7 @@ function addImageHighlightAfterFirstLoad(data) { setTimeout(addImageHighlightAfterFirstLoad, 500, data); } } - addHighlightEffect(element['highlight']) + addHighlightEffect(element['highlight']); } }); } diff --git a/Resources/Public/Javascript/PageView/Utility.js b/Resources/Public/Javascript/PageView/Utility.js index 21d17c6b2..48cc51719 100644 --- a/Resources/Public/Javascript/PageView/Utility.js +++ b/Resources/Public/Javascript/PageView/Utility.js @@ -874,7 +874,7 @@ dlfUtils.searchFeatureCollectionForCoordinates = function (featureCollection, co var features = []; featureCollection.forEach(function (ft) { if (ft.get('fulltext') !== undefined) { - if ((ft.get('width') + '_' + ft.get('height') + '_' + ft.get('hpos') + '_' + ft.get('vpos')) == coordinates) { + if ((ft.get('width') + '_' + ft.get('height') + '_' + ft.get('hpos') + '_' + ft.get('vpos')) === coordinates) { features.push(ft); } } From 5af13b5ac3338bf0768caea1fecb886a15d1556d Mon Sep 17 00:00:00 2001 From: Beatrycze Volk Date: Wed, 30 Jun 2021 17:27:38 +0200 Subject: [PATCH 30/51] Fix request handler in solr config --- .../configsets/dlf/conf/solrconfig.xml | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml b/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml index f6b09bbb9..198f2e94d 100644 --- a/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml +++ b/Configuration/ApacheSolr/configsets/dlf/conf/solrconfig.xml @@ -685,7 +685,13 @@ provided with Solr is "SearchHandler" It delegates to a sequent of SearchComponents (see below) and supports distributed queries across multiple shards - --> + + Instruct the request handlers you want to enable OCR highlighting for to include the + search component you defined above. This example uses the standard /select handler. + + CAUTION: Make sure that the OCR highlight component is listed **before** the standard + highlighting component, but **after** the query component. + --> + + + query + ocrHighlight + highlight + - - - - query - ocrHighlight - highlight - - -