HOME


Mini Shell 1.0
DIR: /home/dhnidqcz/journal.pragmaticsng.org/lib__47455f6/pkp/classes/search/
Upload File :
Current File : //home/dhnidqcz/journal.pragmaticsng.org/lib__47455f6/pkp/classes/search/SubmissionSearchIndex.php
<?php

/**
 * @file classes/search/SubmissionSearchIndex.php
 *
 * Copyright (c) 2014-2021 Simon Fraser University
 * Copyright (c) 2003-2021 John Willinsky
 * Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
 *
 * @class SubmissionSearchIndex
 *
 * @ingroup search
 *
 * @brief Class to maintain a submission search index.
 */

namespace PKP\search;

use APP\submission\Submission;
use PKP\config\Config;
use PKP\core\PKPString;

abstract class SubmissionSearchIndex
{
    public const SEARCH_STOPWORDS_FILE = 'lib/pkp/registry/stopwords.txt';

    // Words are truncated to at most this length
    public const SEARCH_KEYWORD_MAX_LENGTH = 40;

    /**
     * Split a string into a clean array of keywords
     *
     * @param string|array $text
     * @param bool $allowWildcards
     *
     * @return string[] of keywords
     */
    public static function filterKeywords($text, $allowWildcards = false, bool $allowShortWords = false, bool $allowNumericWords = false): array
    {
        $minLength = Config::getVar('search', 'min_word_length');
        $stopwords = static::loadStopwords();

        // Join multiple lines into a single string
        if (is_array($text)) {
            $text = join("\n", $text);
        }

        // Attempts to fix bad UTF-8 characters
        $previous = mb_substitute_character();
        mb_substitute_character('none');
        $text = mb_convert_encoding($text, 'UTF-8', 'UTF-8');
        mb_substitute_character($previous);

        // Removes all control (C) characters, marks (M), punctuations (P), symbols (S) and separators (Z) except "*" (which is addressed below)
        $text = PKPString::regexp_replace('/(?!\*)[\\p{C}\\p{M}\\p{P}\\p{S}\\p{Z}]+/', ' ', $text);
        $text = PKPString::regexp_replace('/[\*]/', $allowWildcards ? '%' : ' ', $text);
        $text = PKPString::strtolower($text);

        // Split into words
        $words = PKPString::regexp_split('/\s+/', $text);

        // FIXME Do not perform further filtering for some fields, e.g., author names?

        $keywords = [];
        foreach ($words as $word) {
            // Ignores: stop words, short words (when $allowShortWords is false) and words composed solely of numbers (when $allowNumericWords is false)
            if (empty($stopwords[$word]) && ($allowShortWords || PKPString::strlen($word) >= $minLength) && ($allowNumericWords || !is_numeric($word))) {
                $keywords[] = PKPString::substr($word, 0, static::SEARCH_KEYWORD_MAX_LENGTH);
            }
        }
        return $keywords;
    }

    /**
     * Return list of stopwords.
     * FIXME: Should this be locale-specific?
     *
     * @return array<string,int> Stop words (in lower case) as keys and 1 as value
     */
    protected static function loadStopwords()
    {
        static $searchStopwords;

        return $searchStopwords ??= array_fill_keys(
            collect(file(base_path(static::SEARCH_STOPWORDS_FILE)))
                ->map(fn (string $word) => trim($word))
                // Ignore comments/line-breaks
                ->filter(fn (string $word) => !empty($word) && $word[0] !== '#')
                // Include a map for empty words
                ->push('')
                ->toArray(),
            1
        );
    }

    /**
     * Let the indexing back-end know that the current transaction
     * finished so that the index can be batch-updated.
     */
    abstract public function submissionChangesFinished();

    /**
     * Signal to the indexing back-end that the metadata of a submission
     * changed.
     *
     * Push indexing implementations will try to immediately update
     * the index to reflect the changes. Pull implementations will
     * mark articles as "changed" and let the indexing back-end decide
     * the best point in time to actually index the changed data.
     *
     * @param Submission $submission
     */
    abstract public function submissionMetadataChanged($submission);

    /**
     * Remove indexed file contents for a submission
     *
     * @param Submission $submission
     */
    abstract public function clearSubmissionFiles($submission);

    /**
     * Delete a submission's search indexing
     *
     * @param int $type optional
     * @param int $assocId optional
     */
    abstract public function deleteTextIndex(
        int $submissionId,
        $type = null,
        $assocId = null
    );
}

if (!PKP_STRICT_MODE) {
    class_alias('\PKP\search\SubmissionSearchIndex', '\SubmissionSearchIndex');
    foreach (['SEARCH_STOPWORDS_FILE', 'SEARCH_KEYWORD_MAX_LENGTH'] as $constantName) {
        define($constantName, constant('\SubmissionSearchIndex::' . $constantName));
    }
}