<?php
/**
* @file tools/convertApacheAccessLogFile.php
*
* Copyright (c) 2022 Simon Fraser University
* Copyright (c) 2022 John Willinsky
* Distributed under the GNU GPL v3. For full terms see the file docs/COPYING.
*
* @class ConvertApacheAccessLogFile
*
* @ingroup tools
*
* @brief CLI tool to copy, prepare and convert apache access log file into the new format needed for stats reprocessing.
*
* The file will be copied to the {files_dir}/usageStats/tmp/ folder,
* only entries related to the current installation will be kept,
* the file will be split by day,
* renamed into apache_usage_events_YYYYMMDD.log,
* converted to the new format,
* and copied into the {files_dir}/usageStats/archive/ folder.
*
* Special cases from the release 2.x are handled as following:
*
* Issue Galley:
* with PDF viewer:
* issue/viewIssue/issueId/galleyId followed by issue/viewFile/issueId/galleyId
* -> only issue/viewFile/issueId/galleyId will be considered.
* There is also only issue/download/issueId/galleyId (when download link is used).
* without PDF viewer:
* issue/viewIssue/issueId/galleyId will not be considered because the file is actually not downloaded.
* But issue/download/issueId/galleyId will be considered.
*
* PDF Galley:
* article/view/articleId/galleyId followed by article/viewFile/articleId/galleyId
* -> only article/viewFile/articleId/galleyId will be considered.
* There is also only article/download/articleId/galleyId (when download link is used).
* without PDF viewer:
* article/view/articleId/galleyId will not be considered because the file is actually not downloaded.
* But article/download/articleId/galleyId will be considered.
*
* HTML Galley:
* article/view/articleId/galleyId followed by article/viewFile/articleId/galleyId
* -> only article/viewFile/articleId/galleyId will be considered.
*
* Other and Remote Galley:
* article/view/articleId/galleyId
*
* Supp File:
* article/downloadSuppFile/articleId/galleyId
*/
require(dirname(__FILE__, 4) . '/tools/bootstrap.php');
use APP\core\Application;
use APP\facades\Repo;
use APP\statistics\StatisticsHelper;
use PKP\cliTool\ConvertLogFileTool;
use PKP\context\Context;
use PKP\db\DAORegistry;
use PKP\file\FileManager;
use PKP\statistics\PKPStatisticsHelper;
use PKP\submission\Genre;
use PKP\task\FileLoader;
class ConvertApacheAccessLogFile extends ConvertLogFileTool
{
/**
* Path to the egrep program, required for this tool to work, e.g. '/bin/egrep'.
* It is used to filter the log file by context path and to split it by day.
*/
public const EGREP_PATH = '/bin/egrep';
/**
* Whether the URL parameters are used instead of CGI PATH_INFO.
* This is the former variable 'disable_path_info' in the config.inc.php
*
* This needs to be set to true if the URLs in the old log file contain the parameters as URL query string.
*/
public const PATH_INFO_DISABLED = false;
/**
* Regular expression that is used for parsing the apache access log file.
*
* The default regex can parse an apache access log file in combined format
* ("%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"").
*
* If the apache log file is in a different format, the correct regex needs to be entered here, so
* that ip, date, url, returnCode, and userAgent can be extracted as named subpatterns,
* see also PHP subpatterns naming: https://www.php.net/manual/en/regexp.reference.subpatterns.php
*/
public const PARSEREGEX = '/^(?P<ip>\S+) \S+ \S+ \[(?P<date>.*?)\] "\S+ (?P<url>\S+).*?" (?P<returnCode>\S+) \S+ ".*?" "(?P<userAgent>.*?)"/';
/**
* PHP format of the date and time in the log file.
* See https://www.php.net/manual/en/datetime.format.php
*
* The default format can parse the apache access log file combined format ([day/month/year:hour:minute:second zone]).
*
* If the time in the apache log file is in a different format, the correct PHP format needs to be entered here.
*/
// TO-DO: ask how to deal with timezone, do we need it?
public const PHP_DATETIME_FORMAT = 'd/M/Y:H:i:s O';
/**
* PHP format of the date (without time and timezone).
* It is used to build the per-day search pattern when the log file is split by day.
*/
public const PHP_DATE_FORMAT = 'd/M/Y';
/**
 * Constructor.
 *
 * Prints the usage information and exits with code 8 unless one or two
 * arguments remain in $this->argv after the parent constructor has run.
 * Exits with code 9 when the egrep program cannot be found at EGREP_PATH.
 *
 * @param array $argv command-line arguments
 */
public function __construct(array $argv = [])
{
    parent::__construct($argv);
    $argCount = count($this->argv);
    if ($argCount < 1 || $argCount > 2) {
        $this->usage();
        exit(8);
    }
    // This tool needs egrep path configured: abort when the program is
    // NOT available (the condition was previously inverted, so the tool
    // refused to run exactly when egrep was present).
    if (!file_exists(self::EGREP_PATH)) {
        fwrite(STDERR, 'Error: This tool needs egrep program. Please define the constatn EGREP_PATH in this script, enter there the path to egrep command on your machine.' . PHP_EOL);
        exit(9);
    }
}
/**
 * Get the folder in which the log files are prepared and converted:
 * the tmp/ sub-folder of the usage stats directory.
 */
public function getLogFileDir(): string
{
    $usageStatsDir = PKPStatisticsHelper::getUsageStatsDirPath();
    return $usageStatsDir . '/tmp';
}
/**
 * Get the regular expression used to parse a line of the log file
 * (the PARSEREGEX constant of this script).
 */
public function getParseRegex(): string
{
    $regex = self::PARSEREGEX;
    return $regex;
}
/**
 * Get the PHP date-time format of the timestamps in the log file
 * (the PHP_DATETIME_FORMAT constant of this script).
 */
public function getPhpDateTimeFormat(): string
{
    $format = self::PHP_DATETIME_FORMAT;
    return $format;
}
/**
 * Whether the logged URLs carry their parameters as URL query string
 * instead of CGI PATH_INFO (the PATH_INFO_DISABLED constant of this script).
 */
public function isPathInfoDisabled(): bool
{
    $disabled = self::PATH_INFO_DISABLED;
    return $disabled;
}
/**
 * This tool always handles apache access log files, so this is always true.
 */
public function isApacheAccessLogFile(): bool
{
    return true;
}
/**
 * Print command usage information: a description of what the tool does,
 * followed by the command synopsis.
 */
public function usage()
{
    // NOTE: the continuation lines of this literal are part of the output and must stay unindented.
    $description = "\nConvert the passed apache access log file into the new usage stats log file format.
This will copy the apache access file to the usageStats/tmp/ folder in the files directory,
filter entries related to this installation, split the file by day, rename the result file(s)
into apache_usage_events_YYYYMMDD.log, convert them into the new JSON format, and
copy them to usageStats/archive/ folder.
Must run under user with enough privilegies to read access apache log files.\n";
    $synopsis = " Usage: php {$this->scriptName} [path/to/apache/log/file.log]\n\n";
    echo $description . $synopsis;
}
/**
 * Entry point: (re)create the temporary processing folder and process
 * the log file passed as the first command-line argument.
 *
 * Exits with code 10 if the temporary folder cannot be created and with
 * code 11 if the given log file does not exist.
 */
public function execute(): void
{
    $fileManager = new FileManager();
    $logFilePath = current($this->argv);
    $tmpDir = $this->getLogFileDir();

    // Always start from an empty tmp/ folder
    if ($fileManager->fileExists($tmpDir, 'dir')) {
        $fileManager->rmtree($tmpDir);
    }
    if (!$fileManager->mkdir($tmpDir)) {
        fwrite(STDERR, "Error: Can't create folder " . $tmpDir . PHP_EOL);
        exit(10);
    }

    if (!$fileManager->fileExists($logFilePath)) {
        fwrite(STDERR, "Error: File {$logFilePath} don't exist or can't be accessed." . PHP_EOL);
        exit(11);
    }
    $this->processAccessLogFile($logFilePath);
    // Do not remove tmp/ folder here -- it could be used by admins for checking and debugging
}
/**
 * Run the whole processing pipeline for one access log file:
 * copy it to the usageStats/tmp/ folder,
 * filter the entries related to this installation,
 * split the result by day,
 * convert every daily file into the new JSON format,
 * and copy it to the usageStats/archive/ folder.
 *
 * If the passed file name ends with .gz, each archived daily file is
 * gzip-compressed as well.
 */
public function processAccessLogFile(string $filePath)
{
    $dailyFiles = $this->splitFileByDay(
        $this->filterFile(
            $this->copyFile($filePath)
        )
    );
    $fileManager = new FileManager();
    $sourceIsGzipped = pathinfo($filePath, PATHINFO_EXTENSION) == 'gz';
    foreach ($dailyFiles as $dailyFile) {
        $this->convert($dailyFile);
        $this->archive($dailyFile);
        if (!$sourceIsGzipped) {
            continue;
        }
        $archivedFilePath = StatisticsHelper::getUsageStatsDirPath() . '/' . FileLoader::FILE_LOADER_PATH_ARCHIVE . '/' . $dailyFile;
        $fileManager->gzCompressFile($archivedFilePath);
    }
}
/**
 * Copy the access log file into the folder usageStats/tmp/, keeping its base name.
 * Exits with code 12 if the file cannot be copied.
 *
 * @return string Path of the copy in the tmp/ folder.
 */
public function copyFile(string $filePath): string
{
    $baseName = pathinfo($filePath, PATHINFO_BASENAME);
    $targetPath = $this->getLogFileDir() . '/' . $baseName;
    $copied = (new FileManager())->copyFile($filePath, $targetPath);
    if (!$copied) {
        fwrite(STDERR, "Could not copy file from {$filePath} to {$targetPath}." . PHP_EOL);
        exit(12);
    }
    echo "File {$filePath} copied to {$targetPath}.\n";
    return $targetPath;
}
/**
 * Filter the access log file entries related to this installation, i.e.
 * those that contain one of the existing context paths.
 * The filtered entries are saved into a new file with the ending _tmp.
 *
 * A .gz file is decompressed first (exit code 13 on failure).
 * Exits with code 14 if egrep fails and with code 15 if the filtered
 * file contains no entries.
 *
 * @return string Path of the filtered file.
 */
public function filterFile(string $filePath): string
{
    $fileMgr = new FileManager();
    if (pathinfo($filePath, PATHINFO_EXTENSION) == 'gz') {
        try {
            $filePath = $fileMgr->gzDecompressFile($filePath);
        } catch (Exception $e) {
            fwrite(STDERR, $e->getMessage() . PHP_EOL);
            exit(13);
        }
    }
    $filteredFilePath = $filePath . '_tmp';
    $callback = fn (Context $context): string => $context->getPath();
    // Each path is shell-quoted on its own; the quoted pieces are then joined with '/|/'
    // inside an outer pair of single quotes, which the shell concatenates into one
    // alternation pattern path1/|/path2/|/...
    $escapedContextPaths = implode('/|/', array_map('escapeshellarg', array_map($callback, $this->contextsByPath)));
    $output = null;
    $returnValue = 0;
    exec(escapeshellarg(self::EGREP_PATH) . " -i '" . $escapedContextPaths . "' " . escapeshellarg($filePath) . ' > ' . escapeshellarg($filteredFilePath), $output, $returnValue);
    // Only exit statuses above 1 are treated as failure; status 1 (no line selected)
    // is handled by the size check below.
    if ($returnValue > 1) {
        fwrite(STDERR, 'Error: the execution of ' . self::EGREP_PATH . ' is not possible.' . PHP_EOL);
        exit(14);
    }
    // The file has just been written by an external process, so drop cached stat data.
    clearstatcache();
    // Check the FILTERED file (not the input file): it is empty when no entry matched.
    if (!is_file($filteredFilePath) || !filesize($filteredFilePath)) {
        fwrite(STDERR, 'Error: No entries found related to this installation.' . PHP_EOL);
        exit(15);
    }
    return $filteredFilePath;
}
/**
 * Split access log file by day. The new, daily files will be named to apache_usage_events_YYYYMMDD.log
 *
 * The first and the last parsable line of the file determine the range of
 * days; for every calendar day in that range (both ends included) the
 * lines containing that date are written by egrep into a separate file
 * in the tmp/ folder.
 *
 * Exits with code 16 if no date could be read from the file and with
 * code 17 if egrep fails.
 *
 * @return array List of daily access log file names (without folder).
 */
public function splitFileByDay(string $filePath): array
{
    // Extract the timestamp of a log line; null if the line does not match
    // the parse regex or the timestamp cannot be parsed.
    $parseDate = static function ($line): ?DateTimeImmutable {
        if (!is_string($line) || !preg_match(self::PARSEREGEX, $line, $m)) {
            return null;
        }
        // Prefer the named subpattern (the regex is configurable),
        // fall back to its position in the default regex.
        $rawDate = $m['date'] ?? $m[2] ?? null;
        if ($rawDate === null) {
            return null;
        }
        $date = DateTimeImmutable::createFromFormat(self::PHP_DATETIME_FORMAT, $rawDate);
        return $date ?: null;
    };

    // Get the first and the last date in the log file
    $firstDate = $lastDate = null;
    $splFileObject = new SplFileObject($filePath, 'r');
    while (!$splFileObject->eof()) {
        $firstDate = $parseDate($splFileObject->fgets());
        if ($firstDate) {
            break;
        }
    }
    // Jump to the end of the file and walk backwards until a parsable line is found.
    $splFileObject->seek(PHP_INT_MAX);
    $lastLineNo = $splFileObject->key() + 1;
    do {
        $splFileObject->seek($lastLineNo);
        $lastDate = $parseDate($splFileObject->current());
        if ($lastDate) {
            break;
        }
        $lastLineNo = $splFileObject->key() - 1;
        // >= 0 so that line 0 is inspected too (a file with a single entry)
    } while ($lastLineNo >= 0);
    //explicitly assign null, so that the file can be deleted
    $splFileObject = null;
    if (is_null($firstDate) || is_null($lastDate)) {
        fwrite(STDERR, 'Error: First or last date not found.' . PHP_EOL);
        exit(16);
    }

    // Get all days between the first and the last date, including the last date.
    // Only the calendar day as written in the log matters, so both ends are reduced
    // to midnight in one common time zone. DatePeriod excludes its end date,
    // hence the end is set to the day after the last one.
    $utc = new DateTimeZone('UTC');
    $firstDay = new DateTimeImmutable($firstDate->format('Y-m-d'), $utc);
    $lastDay = new DateTimeImmutable($lastDate->format('Y-m-d'), $utc);
    $period = new DatePeriod(
        $firstDay,
        new DateInterval('P1D'),
        $lastDay->modify('+1 day')
    );

    $dailyFiles = [];
    foreach ($period as $dayDate) {
        $day = $dayDate->format('Ymd');
        // Check if a converted apache file with the same day already exists in any of usageStats/ folders.
        // glob() may return false on error; treat that as "nothing found".
        $existingApacheUsageEventsFiles = glob(PKPStatisticsHelper::getUsageStatsDirPath() . '/*/apache_usage_events_' . $day . '*') ?: [];
        $existingApacheUsageEventsFilesCount = count($existingApacheUsageEventsFiles);
        $countPartOfFileName = '';
        if ($existingApacheUsageEventsFilesCount) {
            $countPartOfFileName = "_{$existingApacheUsageEventsFilesCount}_";
            fwrite(STDERR, "Warning: One or more files apache_usage_events_{$day}.log already exist. You will need to clean or merge them into one before reprocessing the statistics." . PHP_EOL);
        }
        $dailyFileName = 'apache_usage_events_' . $day . $countPartOfFileName . '.log';
        $dayFilePath = $this->getLogFileDir() . '/' . $dailyFileName;
        $output = null;
        $returnValue = 0;
        exec(escapeshellarg(self::EGREP_PATH) . " -i '" . preg_quote($dayDate->format(self::PHP_DATE_FORMAT)) . "' " . escapeshellarg($filePath) . ' > ' . escapeshellarg($dayFilePath), $output, $returnValue);
        // Exit status 1 (no line selected for that day) is not treated as a failure.
        if ($returnValue > 1) {
            fwrite(STDERR, 'Error: Could not split file by day.' . PHP_EOL);
            exit(17);
        }
        $dailyFiles[] = $dailyFileName;
        echo "File {$dayFilePath} created.\n";
    }
    return $dailyFiles;
}
/**
 * Copy the given file from the folder usageStats/tmp/ into usageStats/archive/.
 * Exits with code 18 if the file cannot be copied.
 *
 * @param string $fileName Name of the file (without folder)
 */
public function archive(string $fileName): void
{
    $sourcePath = $this->getLogFileDir() . '/' . $fileName;
    $targetPath = implode('/', [
        StatisticsHelper::getUsageStatsDirPath(),
        FileLoader::FILE_LOADER_PATH_ARCHIVE,
        $fileName,
    ]);
    $copied = (new FileManager())->copyFile($sourcePath, $targetPath);
    if (!$copied) {
        fwrite(STDERR, "Error: Could not copy file from {$sourcePath} to {$targetPath}." . PHP_EOL);
        exit(18);
    }
    echo "File {$sourcePath} successfully archived to {$targetPath}.\n";
}
/**
 * Get the expected "page/operation" combinations for the running application.
 * They are grouped by the object type (assoc type) constant that
 * they give access to; the context assoc type always comes first.
 *
 * @throws Exception If the application name is not ojs2, omp or ops.
 */
protected function getExpectedPageAndOp(): array
{
    $contextAssocType = Application::getContextAssocType();
    $contextPageAndOp = ['index/index'];
    $appName = Application::get()->getName();

    // The application specific constants are only referenced inside the matching branch.
    if ($appName == 'ojs2') {
        $objectPageAndOp = [
            Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER => [
                'article/downloadSuppFile'],
            Application::ASSOC_TYPE_SUBMISSION_FILE => [
                'article/download', 'article/viewFile'],
            Application::ASSOC_TYPE_SUBMISSION => [
                'article/view', 'article/viewArticle'],
            Application::ASSOC_TYPE_ISSUE => [
                'issue/view'],
            Application::ASSOC_TYPE_ISSUE_GALLEY => [
                'issue/download', 'issue/viewFile']
        ];
        $contextPageAndOp[] = 'index';
    } elseif ($appName == 'omp') {
        $objectPageAndOp = [
            Application::ASSOC_TYPE_SUBMISSION_FILE => [
                'catalog/download'],
            Application::ASSOC_TYPE_MONOGRAPH => [
                'catalog/book'],
            Application::ASSOC_TYPE_SERIES => [
                'catalog/series']
        ];
        $contextPageAndOp[] = 'catalog/index';
    } elseif ($appName == 'ops') {
        $objectPageAndOp = [
            Application::ASSOC_TYPE_SUBMISSION_FILE => [
                'preprint/download'],
            Application::ASSOC_TYPE_SUBMISSION => [
                'preprint/view']
        ];
        $contextPageAndOp[] = 'index';
    } else {
        throw new Exception('Unrecognized application name.');
    }

    // Array union: the context entry stays first and wins on a key collision
    return [$contextAssocType => $contextPageAndOp] + $objectPageAndOp;
}
/**
 * Set assoc type and IDs from the passed page, operation and arguments,
 * by delegating to the handler of the running application (OJS, OMP or OPS).
 * The $op parameter is not used by this implementation.
 *
 * @throws Exception If the application name is not ojs2, omp or ops.
 */
protected function setAssoc(int $assocType, string $op, array $args, array &$newEntry): void
{
    $appName = Application::get()->getName();
    if ($appName == 'ojs2') {
        $this->setOJSAssoc($assocType, $args, $newEntry);
        return;
    }
    if ($appName == 'omp') {
        $this->setOMPAssoc($assocType, $args, $newEntry);
        return;
    }
    if ($appName == 'ops') {
        $this->setOPSAssoc($assocType, $args, $newEntry);
        return;
    }
    throw new Exception('Unrecognized application name!');
}
/**
 * Set assoc type and IDs from the passed page, operation and
 * arguments specific to OJS.
 *
 * On success the entry gets its 'assocType' and the IDs belonging to that
 * type (submissionId, representationId, submissionFileId, fileType,
 * issueId, issueGalleyId). When the URL arguments are incomplete or the
 * referenced object cannot be found, a message is written to STDERR
 * (in most cases) and the entry is left without 'assocType'.
 *
 * @param int $assocType Assoc type derived from the requested page and operation
 * @param array $args URL arguments following the page and operation
 * @param array $newEntry The log entry being built; $newEntry['contextId'] is expected to be set already
 */
protected function setOJSAssoc(int $assocType, array $args, array &$newEntry): void
{
    switch ($assocType) {
        case Application::getContextAssocType():
            // $newEntry['contextId'] has already been set
            $newEntry['assocType'] = $assocType;
            break;

        case Application::ASSOC_TYPE_SUBMISSION:
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
                break;
            }
            $submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
            if (!$submission) {
                fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the journal with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $submissionId = $submission->getId();
            // If it is an older submission version, the arguments must be:
            // $submissionId/version/$publicationId.
            // Consider also releases 2.x where log files can contain URL
            // view/$submissionId/$representationId i.e. without $submissionFileId argument
            // for other and remote galleys.
            if (in_array('version', $args)) {
                if ($args[1] !== 'version' || !isset($args[2])) {
                    fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>.' . PHP_EOL);
                    break;
                }
                $publicationId = (int) $args[2];
                if (!Repo::publication()->exists($publicationId, $submissionId)) {
                    fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                    break;
                }
            } elseif (count($args) == 2) {
                // Consider usage stats log files from releases 2.x:
                // The URL article/view/{$articleId}/{$galleyId} was used for assoc type galley.
                // Only Other galleys will be considered here (s. file description above).
                // Those should then be assoc type = submission file.
                $representationUrlPath = $args[1];
                $galley = $representationId = null;
                if (ctype_digit((string) $representationUrlPath)) {
                    // assume it is ID and not the URL path
                    $representationId = (int) $representationUrlPath;
                    $galley = Repo::galley()->get($representationId);
                } else {
                    // We need to get the publication in order to be able to get galley by URL path
                    // We cannot assume that this is the current publication,
                    // because the log entry can be long time ago, and
                    // since then there could be new submission versions created,
                    // so take the first publication and galley found with the given representationUrlPath.
                    // (Different publications can contain the same galley URL path.)
                    // It is not accurate but only possible.
                    $publications = $submission->getData('publications');
                    foreach ($publications as $publication) {
                        foreach ($publication->getData('galleys') as $publicationGalley) {
                            if ($publicationGalley->getBestGalleyId() == $representationUrlPath) {
                                $galley = $publicationGalley;
                                $representationId = $publicationGalley->getId();
                                break 2;
                            }
                        }
                    }
                }
                if (!isset($galley)) {
                    fwrite(STDERR, "Galley with the URL path {$representationUrlPath} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                    break;
                }
                $submissionFileId = $galley->getData('submissionFileId');
                if (!isset($submissionFileId)) {
                    // Galley without a file: nothing to count
                    break;
                }
                $submissionFile = Repo::submissionFile()->get($submissionFileId, $submissionId);
                if (!isset($submissionFile)) {
                    fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                    break;
                }
                $fileType = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype'));
                if ($fileType == StatisticsHelper::STATISTICS_FILE_TYPE_PDF || $fileType == StatisticsHelper::STATISTICS_FILE_TYPE_HTML) {
                    // Do not consider PDF and HTML file, the download URL will follow
                    break;
                }
                $newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE;
                $newEntry['submissionId'] = $submissionId;
                $newEntry['representationId'] = $representationId;
                $newEntry['submissionFileId'] = $submissionFileId;
                $newEntry['fileType'] = $fileType;
                break;
            }
            $newEntry['submissionId'] = $submissionId;
            $newEntry['assocType'] = $assocType;
            break;

        case Application::ASSOC_TYPE_SUBMISSION_FILE:
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
                break;
            }
            if (!isset($args[1])) {
                fwrite(STDERR, 'Missing galley ID URL parameter.' . PHP_EOL);
                break;
            }
            $submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
            if (!$submission) {
                fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the journal with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $submissionId = $submission->getId();
            // If it is an older submission version, the arguments must be:
            // $submissionId/version/$publicationId/$representationId/$submissionFileId.
            // Consider also this issue: https://github.com/pkp/pkp-lib/issues/6573
            // where apache log files can contain URL download/$submissionId/$representationId,
            // i.e. without $submissionFileId argument.
            // Also the URLs from releases 2.x will not have submissionFileId.
            $publicationId = $submissionFileId = null; // do not necessarily exist
            if (in_array('version', $args)) {
                if ($args[1] !== 'version' || !isset($args[2]) || !isset($args[3])) {
                    // if version is there, there must be $publicationId and $representationId arguments
                    fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>/<galleyId>/<fileId>.' . PHP_EOL);
                    break;
                }
                $publicationId = (int) $args[2];
                $representationUrlPath = $args[3];
                if (isset($args[4])) {
                    $submissionFileId = (int) $args[4];
                }
            } else {
                $representationUrlPath = $args[1];
                if (isset($args[2])) {
                    $submissionFileId = (int) $args[2];
                }
            }
            // Find the galley and representation ID
            $representationId = $galley = null;
            if (ctype_digit((string) $representationUrlPath)) {
                // assume it is ID and not the URL path
                $representationId = (int) $representationUrlPath;
                $galley = Repo::galley()->get($representationId);
                if (!$galley) {
                    fwrite(STDERR, "Galley with the ID {$representationUrlPath} does not exist." . PHP_EOL);
                    break;
                }
            } else {
                // We need to get the publication in order to be able to get galley by URL path
                $publications = $submission->getData('publications');
                if (isset($publicationId)) {
                    $publication = $publications->first(function ($value, $key) use ($publicationId) {
                        return $value->getId() == $publicationId;
                    });
                    if (!$publication) {
                        fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                        break;
                    }
                    $galley = Repo::galley()->getByUrlPath($representationUrlPath, $publication);
                    if (!$galley) {
                        fwrite(STDERR, "Galley with the URL path {$representationUrlPath} does not exist in the publication (submission version) with the ID {$publicationId}." . PHP_EOL);
                        break;
                    }
                    $representationId = $galley->getId();
                } else {
                    // We cannot assume that this is the current publication,
                    // because the log entry can be long time ago, and
                    // since then there could be new submission versions created,
                    // so take the first publication and galley found with the given representationUrlPath.
                    // (Different publications can contain the same galley URL path.)
                    $possibleGalleys = [];
                    foreach ($publications as $publication) {
                        foreach ($publication->getData('galleys') as $publicationGalley) {
                            if ($publicationGalley->getBestGalleyId() == $representationUrlPath) {
                                $possibleGalleys[] = $publicationGalley;
                                if (isset($submissionFileId) && $publicationGalley->getData('submissionFileId') == $submissionFileId) {
                                    $galley = $publicationGalley;
                                    $representationId = $publicationGalley->getId();
                                    break 2;
                                }
                            }
                        }
                    }
                    if (empty($possibleGalleys)) {
                        fwrite(STDERR, "Galley with the URL path {$representationUrlPath} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                        break;
                    }
                    // if no matching galley has been found yet, take the first possible
                    if (!isset($representationId)) {
                        $galley = $possibleGalleys[0];
                        $representationId = $galley->getId();
                    }
                }
            }
            if (!$submissionFileId) {
                $submissionFileId = $galley->getData('submissionFileId');
            }
            // A galley without a file (e.g. a remote galley) has nothing to look up;
            // skip the entry instead of passing an empty ID to the repository,
            // as the ASSOC_TYPE_SUBMISSION branch above does.
            if (!$submissionFileId) {
                fwrite(STDERR, "Galley with the ID {$representationId} does not have a submission file." . PHP_EOL);
                break;
            }
            $submissionFile = Repo::submissionFile()->get($submissionFileId, $submissionId);
            if (!$submissionFile) {
                fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                break;
            }
            if ($galley->getData('submissionFileId') != $submissionFileId) {
                // This check is e.g. when representation ID (and not URL path) and submissionFileId are given as arguments
                fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not belong to the galley with the ID {$representationId}." . PHP_EOL);
                break;
            }
            // is this a full text or supp file
            $genreDao = DAORegistry::getDAO('GenreDAO');
            $genre = $genreDao->getById($submissionFile->getData('genreId'));
            if ($genre->getCategory() != Genre::GENRE_CATEGORY_DOCUMENT || $genre->getSupplementary() || $genre->getDependent()) {
                $newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER;
            } else {
                $newEntry['assocType'] = $assocType;
            }
            $newEntry['submissionId'] = $submissionId;
            $newEntry['representationId'] = $representationId;
            $newEntry['submissionFileId'] = $submissionFileId;
            $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype'));
            break;

        case Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER:
            // This is the URL article/downloadSuppFile/articleId/suppFileId from a 2.x log file
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
                break;
            }
            if (!isset($args[1])) {
                fwrite(STDERR, 'Missing supp file ID URL parameter.' . PHP_EOL);
                break;
            }
            $submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
            if (!$submission) {
                fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the journal with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $submissionId = $submission->getId();
            $galley = $submissionFile = null;
            $publications = $submission->getData('publications');
            // Search all galleys of all publications for the one whose file carries
            // the old supp file ID, or whose URL path equals the given argument.
            foreach ($publications as $publication) {
                foreach ($publication->getData('galleys') as $possibleGalley) {
                    $possibleSubmissionFileId = $possibleGalley->getData('submissionFileId');
                    if ($possibleSubmissionFileId) { // it is not a remote supp file
                        $possibleSubmissionFile = Repo::submissionFile()->get($possibleSubmissionFileId, $submissionId);
                        if ($possibleSubmissionFile) {
                            if (ctype_digit((string) $args[1])) { // supp file ID
                                if ($possibleSubmissionFile->getData('old-supp-id') == $args[1]) {
                                    // Galley and file found
                                    $galley = $possibleGalley;
                                    $submissionFile = $possibleSubmissionFile;
                                    break 2;
                                }
                            } else { // supp file URL path
                                if ($possibleGalley->getData('urlPath') == $args[1]) {
                                    // Galley and file found
                                    $galley = $possibleGalley;
                                    $submissionFile = $possibleSubmissionFile;
                                    break 2;
                                }
                            }
                        }
                    }
                }
            }
            if ($galley && $submissionFile) {
                $newEntry['assocType'] = $assocType;
                $newEntry['submissionId'] = $submissionId;
                $newEntry['representationId'] = $galley->getId();
                $newEntry['submissionFileId'] = $submissionFile->getId();
                $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype'));
            } else {
                fwrite(STDERR, 'Supp file could not be found.' . PHP_EOL);
            }
            break;

        case Application::ASSOC_TYPE_ISSUE:
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing issue ID URL parameter.' . PHP_EOL);
                break;
            }
            // Consider issue https://github.com/pkp/pkp-lib/issues/6611
            // where apache log files contain both URLs for issue galley download:
            // issue/view/issueId/galleyId (that should not be considered here), as well as
            // issue/download/issueId/galleyId (that is considered below)
            if (count($args) != 1) {
                break;
            }
            $issue = Repo::issue()->getByBestId($args[0], $newEntry['contextId']);
            if (!$issue) {
                fwrite(STDERR, "Issue with the URL path or ID {$args[0]} does not exist in the journal with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $issueId = $issue->getId();
            $newEntry['issueId'] = $issueId;
            $newEntry['assocType'] = $assocType;
            break;

        case Application::ASSOC_TYPE_ISSUE_GALLEY:
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing issue ID URL parameter.' . PHP_EOL);
                break;
            }
            if (!isset($args[1])) {
                fwrite(STDERR, 'Missing issue galley ID URL parameter.' . PHP_EOL);
                break;
            }
            $issue = Repo::issue()->getByBestId($args[0], $newEntry['contextId']);
            if (!$issue) {
                fwrite(STDERR, "Issue with the URL path or ID {$args[0]} does not exist in the journal with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $issueId = $issue->getId();
            $issueGalleyDao = DAORegistry::getDAO('IssueGalleyDAO');
            $issueGalley = $issueGalleyDao->getByBestId($args[1], $issueId);
            if (!$issueGalley) {
                fwrite(STDERR, "Issue galley with the URL path or ID {$args[1]} does not exist in the issue with the ID {$issueId}." . PHP_EOL);
                break;
            }
            $newEntry['issueId'] = $issueId;
            $newEntry['issueGalleyId'] = $issueGalley->getId();
            $newEntry['assocType'] = $assocType;
            break;
    }
}
/**
* Set assoc type and IDs from the passed page, operation and
* arguments specific to OMP.
*/
protected function setOMPAssoc(int $assocType, array $args, array &$newEntry): void
{
switch ($assocType) {
case Application::getContextAssocType():
// $newEntry['contextId'] has already been set
$newEntry['assocType'] = $assocType;
break;
case Application::ASSOC_TYPE_SUBMISSION:
if (!isset($args[0])) {
fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
break;
}
$submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
if (!$submission) {
fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the press with the ID {$newEntry['contextId']}." . PHP_EOL);
break;
}
$submissionId = $submission->getId();
// If it is an older submission version, the arguments must be:
// $submissionId/version/$publicationId.
$publicationId = null;
if (in_array('version', $args)) {
if ($args[1] !== 'version' || !isset($args[2])) {
fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>.' . PHP_EOL);
break;
}
$publicationId = (int) $args[2];
if (!Repo::publication()->exists($publicationId, $submissionId)) {
fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
break;
}
}
// Is it a chapter landing page
$chapter = null;
if (in_array('chapter', $args)) {
if (isset($publicationId)) {
// The URL is $submissionId/version/$publicationId/chapter/$chapterId
if ($args[3] !== 'chapter' || !isset($args[4])) {
fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>/chapter/<chapterId>.' . PHP_EOL);
break;
}
$chapterId = (int) $args[4];
} else {
// The URL is $submissionId/chapter/$chapterId
if ($args[1] !== 'chapter' || !isset($args[2])) {
fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/chapter/<chapterId>.' . PHP_EOL);
break;
}
$chapterId = (int) $args[2];
}
$chapterDao = DAORegistry::getDAO('ChapterDAO'); /** @var ChapterDAO $chapterDao */
$chapter = $chapterDao->getChapter($chapterId);
if (!$chapter) {
fwrite(STDERR, "Chapter with the ID {$chapterId} does not exist." . PHP_EOL);
break;
}
}
$newEntry['submissionId'] = $submissionId;
$newEntry['assocType'] = isset($chapter) ? Application::ASSOC_TYPE_CHAPTER : $assocType;
$newEntry['chpaterId'] = isset($chapter) ? $chapter->getId() : null;
break;
case Application::ASSOC_TYPE_SUBMISSION_FILE:
if (!isset($args[0])) {
fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
break;
}
if (!isset($args[1])) {
fwrite(STDERR, 'Missing publication format ID URL parameter.' . PHP_EOL);
break;
}
if (!isset($args[2])) {
fwrite(STDERR, 'Missing file or publication ID URL parameter.' . PHP_EOL);
break;
}
$submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
if (!$submission) {
fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the press with the ID {$newEntry['contextId']}." . PHP_EOL);
break;
}
$submissionId = $submission->getId();
// If it is an older submission version, the arguments must be:
// $submissionId/version/$publicationId/$representationId/$submissionFileId.
$publicationId = null;
if (in_array('version', $args)) {
if ($args[1] !== 'version' || !isset($args[2]) || !isset($args[3])) {
fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>/<publicationFormatId>/<fileId>.' . PHP_EOL);
break;
}
$publicationId = (int) $args[2];
$representationUrlPath = $args[3];
$submissionFileId = (int) $args[4];
} else {
$representationUrlPath = $args[1];
$submissionFileId = (int) $args[2];
}
$submissionFile = Repo::submissionFile()->get($submissionFileId, $submissionId);
if (!$submissionFile) {
fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
break;
}
if ($submissionFile->getData('assocType') != Application::ASSOC_TYPE_PUBLICATION_FORMAT) {
fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not belong to a publication format." . PHP_EOL);
break;
}
$representationId = $submissionFile->getData('assocId');
$publicationFormatDao = DAORegistry::getDAO('PublicationFormatDAO'); /** @var PublicationFormatDAO $publicationFormatDao */
if (ctype_digit((string) $representationUrlPath)) {
// assume it is ID and not the URL path
if ($representationUrlPath != $representationId) {
fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not belong to the publication format with ID {$representationUrlPath}." . PHP_EOL);
break;
}
$publicationFormat = $publicationFormatDao->getById($representationId, $publicationId);
if (!$publicationFormat) {
fwrite(STDERR, "Publication format with the ID {$representationUrlPath} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
break;
}
} else {
// We need to get the publication in order to be able to get publication format by URL path
$publications = $submission->getData('publications');
if (isset($publicationId)) {
$publication = $publications->first(function ($value, $key) use ($publicationId) {
return $value->getId() == $publicationId;
});
if (!$publication) {
fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
break;
}
$publicationFormat = $publicationFormatDao->getByBestId($representationUrlPath, $publication->getId());
if (!$publicationFormat) {
fwrite(STDERR, "Publication format with the URL path {$representationUrlPath} does not exist in the publication (submission version) with the ID {$publicationId}." . PHP_EOL);
break;
}
if ($representationId != $publicationFormat->getId()) {
fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not exist in the publication (submission version) with the ID {$publicationId}." . PHP_EOL);
break;
}
} else {
// We cannot assume that this is the current publication,
// because the log entry can be long time ago, and
// since then there could be new submission versions created,
// so take the first publication found with
// publication format with the given representationUrlPath
// that contains the given submission file.
// (Different publications can contain the same publication format URL path.)
$publicationFormat = null;
foreach ($publications as $publication) {
foreach ($publication->getData('publicationFormats') as $possiblePublicationFormat) {
if ($possiblePublicationFormat->getBestId() == $representationUrlPath) {
if ($representationId == $possiblePublicationFormat->getId()) {
$publicationFormat = $possiblePublicationFormat;
break 2;
}
}
}
}
if (!$publicationFormat) {
fwrite(STDERR, "Publication format with the URL path {$representationUrlPath} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
break;
}
}
}
// is this a full text or supp file
$genreDao = DAORegistry::getDAO('GenreDAO');
$genre = $genreDao->getById($submissionFile->getData('genreId'));
if ($genre->getCategory() != Genre::GENRE_CATEGORY_DOCUMENT || $genre->getSupplementary() || $genre->getDependent()) {
$newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER;
} else {
$newEntry['assocType'] = $assocType;
}
$newEntry['submissionId'] = $submissionId;
$newEntry['representationId'] = $representationId;
$newEntry['submissionFileId'] = $submissionFileId;
$newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype'));
$newEntry['chapterId'] = $submissionFile->getData('chapterId');
break;
case Application::ASSOC_TYPE_SERIES:
if (!isset($args[0])) {
fwrite(STDERR, 'Missing series path URL parameter.' . PHP_EOL);
break;
}
$seriesPath = $args[0];
$series = Repo::section()->getByPath($seriesPath, $newEntry['contextId']);
if (!$series) {
fwrite(STDERR, "Series with the path {$seriesPath} does not exist in the press with the ID {$newEntry['contextId']}." . PHP_EOL);
break;
}
$newEntry['seriesId'] = $series->getId();
$newEntry['assocType'] = $assocType;
break;
}
}
/**
 * Set assoc type and IDs from the passed page, operation and
 * arguments specific to OPS.
 *
 * The URL arguments are validated against the stored submission,
 * publication (submission version), galley and submission file. When any
 * validation fails a message is written to STDERR and $newEntry is left
 * unchanged by this method. Assoc types that are not handled below are
 * silently ignored.
 *
 * Expected arguments per assoc type:
 * - context: none ($newEntry['contextId'] has already been set by the caller)
 * - submission: <submissionId> or <submissionId>/version/<publicationId>
 * - submission file: <submissionId>/<galleyId>/<fileId> or
 *   <submissionId>/version/<publicationId>/<galleyId>/<fileId>
 *
 * @param int $assocType Assoc type detected for the log entry
 * @param array $args URL arguments following the page and operation
 * @param array $newEntry Entry being built; 'contextId' is read, and on success
 *  'assocType', 'submissionId' and, for files, 'representationId',
 *  'submissionFileId' and 'fileType' are written
 */
protected function setOPSAssoc(int $assocType, array $args, array &$newEntry): void
{
    switch ($assocType) {
        case Application::getContextAssocType():
            // $newEntry['contextId'] has already been set
            $newEntry['assocType'] = $assocType;
            break;

        case Application::ASSOC_TYPE_SUBMISSION:
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
                break;
            }
            $submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
            if (!$submission) {
                fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the server with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $submissionId = $submission->getId();
            // If it is an older submission version, the arguments must be:
            // $submissionId/version/$publicationId.
            if (in_array('version', $args, true)) {
                // $args[1] is not guaranteed to exist here (only $args[0] was checked),
                // so read it defensively to avoid an undefined array key warning.
                if (($args[1] ?? null) !== 'version' || !isset($args[2])) {
                    fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>.' . PHP_EOL);
                    break;
                }
                // The publication ID is only validated; it is not stored in the entry.
                $publicationId = (int) $args[2];
                if (!Repo::publication()->exists($publicationId, $submissionId)) {
                    fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                    break;
                }
            }
            $newEntry['submissionId'] = $submissionId;
            $newEntry['assocType'] = $assocType;
            break;

        case Application::ASSOC_TYPE_SUBMISSION_FILE:
            if (!isset($args[0])) {
                fwrite(STDERR, 'Missing submission ID URL parameter.' . PHP_EOL);
                break;
            }
            if (!isset($args[1])) {
                fwrite(STDERR, 'Missing galley ID URL parameter.' . PHP_EOL);
                break;
            }
            if (!isset($args[2])) {
                fwrite(STDERR, 'Missing file or publication ID URL parameter.' . PHP_EOL);
                break;
            }
            $submission = Repo::submission()->getByBestId($args[0], $newEntry['contextId']);
            if (!$submission) {
                fwrite(STDERR, "Submission with the URL path or ID {$args[0]} does not exist in the server with the ID {$newEntry['contextId']}." . PHP_EOL);
                break;
            }
            $submissionId = $submission->getId();
            // If it is an older submission version, the arguments must be:
            // $submissionId/version/$publicationId/$representationId/$submissionFileId.
            $publicationId = null;
            if (in_array('version', $args, true)) {
                // All five arguments are required in the versioned form; the file ID
                // ($args[4]) must be checked too, otherwise it would be read as
                // undefined and silently become 0.
                if ($args[1] !== 'version' || !isset($args[2], $args[3], $args[4])) {
                    fwrite(STDERR, 'The following arguments are expected and not found: <submissionId>/version/<publicationId>/<galleyId>/<fileId>.' . PHP_EOL);
                    break;
                }
                $publicationId = (int) $args[2];
                $representationUrlPath = $args[3];
                $submissionFileId = (int) $args[4];
            } else {
                $representationUrlPath = $args[1];
                $submissionFileId = (int) $args[2];
            }
            $submissionFile = Repo::submissionFile()->get($submissionFileId, $submissionId);
            if (!$submissionFile) {
                fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                break;
            }
            // Find the galley and representation ID
            $representationId = $galley = null;
            if (ctype_digit((string) $representationUrlPath)) {
                // assume it is ID and not the URL path
                $representationId = (int) $representationUrlPath;
                $galley = Repo::galley()->get($representationId);
                if (!$galley) {
                    fwrite(STDERR, "Galley with the ID {$representationUrlPath} does not exist." . PHP_EOL);
                    break;
                }
            } else {
                // We need to get the publication in order to be able to get galley by URL path
                $publications = $submission->getData('publications');
                if (isset($publicationId)) {
                    $publication = $publications->first(function ($value, $key) use ($publicationId) {
                        return $value->getId() == $publicationId;
                    });
                    if (!$publication) {
                        fwrite(STDERR, "Publication (submission version) with the ID {$publicationId} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                        break;
                    }
                    $galley = Repo::galley()->getByUrlPath($representationUrlPath, $publication);
                    if (!$galley) {
                        fwrite(STDERR, "Galley with the URL path {$representationUrlPath} does not exist in the publication (submission version) with the ID {$publicationId}." . PHP_EOL);
                        break;
                    }
                    $representationId = $galley->getId();
                } else {
                    // We cannot assume that this is the current publication:
                    // the log entry can be old, and new submission versions
                    // could have been created since then. So take the first
                    // publication found that has a galley with the given
                    // representationUrlPath containing the given submission file.
                    // (Different publications can contain the same galley URL path.)
                    foreach ($publications as $publication) {
                        foreach ($publication->getData('galleys') as $publicationGalley) {
                            if ($publicationGalley->getBestGalleyId() == $representationUrlPath) {
                                if ($publicationGalley->getData('submissionFileId') == $submissionFileId) {
                                    $galley = $publicationGalley;
                                    $representationId = $publicationGalley->getId();
                                    break 2;
                                }
                            }
                        }
                    }
                    if (!$representationId) {
                        fwrite(STDERR, "Galley with the URL path {$representationUrlPath} does not exist in the submission with the ID {$submissionId}." . PHP_EOL);
                        break;
                    }
                }
            }
            if ($galley->getData('submissionFileId') != $submissionFileId) {
                // This check is e.g. when representation ID (and not URL path) and submissionFileId are given as arguments
                fwrite(STDERR, "Submission file with the ID {$submissionFileId} does not belong to the galley with the ID {$representationId}." . PHP_EOL);
                break;
            }
            // Is this a full text or a supplementary file?
            // Anything that is not a primary document genre is counted as "other".
            $genreDao = DAORegistry::getDAO('GenreDAO'); /** @var GenreDAO $genreDao */
            // NOTE(review): if getById() can return null (e.g. file without a genre),
            // the method calls below would fail - confirm the genre always exists.
            $genre = $genreDao->getById($submissionFile->getData('genreId'));
            if ($genre->getCategory() != Genre::GENRE_CATEGORY_DOCUMENT || $genre->getSupplementary() || $genre->getDependent()) {
                $newEntry['assocType'] = Application::ASSOC_TYPE_SUBMISSION_FILE_COUNTER_OTHER;
            } else {
                $newEntry['assocType'] = $assocType;
            }
            $newEntry['submissionId'] = $submissionId;
            $newEntry['representationId'] = $representationId;
            $newEntry['submissionFileId'] = $submissionFileId;
            $newEntry['fileType'] = StatisticsHelper::getDocumentType($submissionFile->getData('mimetype'));
            break;
    }
}
}
// Script entry point: build the CLI tool from the command line arguments
// (falling back to an empty list when $argv is not defined) and run it.
$tool = new ConvertApacheAccessLogFile($argv ?? []);
$tool->execute();
|