From ebe1559a351ece67605567f6d8f14ae339347917 Mon Sep 17 00:00:00 2001 From: dre0059 <eliska.dreveniakova@vsb.cz> Date: Fri, 14 Feb 2025 15:37:28 +0100 Subject: [PATCH] The very first commit --- .gitattributes | 2 + .gitignore | 33 +++ .mvn/wrapper/maven-wrapper.properties | 19 ++ ...e_safety_immunology_virolog-references.txt | 1 + ...ocalTemp\302\263661.263675-references.txt" | 1 + mvnw | 259 ++++++++++++++++++ mvnw.cmd | 149 ++++++++++ pdf_database.db | 0 pom.xml | 175 ++++++++++++ .../ArticleProcessorApplication.java | 13 + .../articleprocessor/GrobidClient.java | 44 +++ .../articleprocessor/apacheTika/PDFbox.java | 40 +++ .../apacheTika/ReferencesScanner.java | 177 ++++++++++++ .../controller/FileUploadController.java | 105 +++++++ .../articleprocessor/model/Author.java | 36 +++ .../articleprocessor/model/Document.java | 62 +++++ .../model/DocumentMetadata.java | 37 +++ .../articleprocessor/model/Reference.java | 41 +++ .../repository/AuthorRepository.java | 12 + .../repository/DocumentRepository.java | 15 + .../repository/ReferenceRepository.java | 9 + .../service/DocumentService.java | 4 + .../service/MetadataParser.java | 70 +++++ .../service/ReferenceService.java | 4 + .../articleprocessor/service/TEIparser.java | 95 +++++++ src/main/resources/application.properties | 44 +++ .../resources/templates/upload-success.html | 88 ++++++ src/main/resources/templates/upload.html | 83 ++++++ .../ArticleProcessorApplicationTests.java | 13 + 29 files changed, 1631 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 .mvn/wrapper/maven-wrapper.properties create mode 100644 UserselidrAppDataLocalTempA_study_of_the_safety_immunology_virolog-references.txt create mode 100644 "UserselidrAppDataLocalTemp\302\263661.263675-references.txt" create mode 100644 mvnw create mode 100644 mvnw.cmd create mode 100644 pdf_database.db create mode 100644 pom.xml create mode 100644 
src/main/java/com/dre0059/articleprocessor/ArticleProcessorApplication.java create mode 100644 src/main/java/com/dre0059/articleprocessor/GrobidClient.java create mode 100644 src/main/java/com/dre0059/articleprocessor/apacheTika/PDFbox.java create mode 100644 src/main/java/com/dre0059/articleprocessor/apacheTika/ReferencesScanner.java create mode 100644 src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java create mode 100644 src/main/java/com/dre0059/articleprocessor/model/Author.java create mode 100644 src/main/java/com/dre0059/articleprocessor/model/Document.java create mode 100644 src/main/java/com/dre0059/articleprocessor/model/DocumentMetadata.java create mode 100644 src/main/java/com/dre0059/articleprocessor/model/Reference.java create mode 100644 src/main/java/com/dre0059/articleprocessor/repository/AuthorRepository.java create mode 100644 src/main/java/com/dre0059/articleprocessor/repository/DocumentRepository.java create mode 100644 src/main/java/com/dre0059/articleprocessor/repository/ReferenceRepository.java create mode 100644 src/main/java/com/dre0059/articleprocessor/service/DocumentService.java create mode 100644 src/main/java/com/dre0059/articleprocessor/service/MetadataParser.java create mode 100644 src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java create mode 100644 src/main/java/com/dre0059/articleprocessor/service/TEIparser.java create mode 100644 src/main/resources/application.properties create mode 100644 src/main/resources/templates/upload-success.html create mode 100644 src/main/resources/templates/upload.html create mode 100644 src/test/java/com/dre0059/articleprocessor/ArticleProcessorApplicationTests.java diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3b41682 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +/mvnw text eol=lf +*.cmd text eol=crlf diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..549e00a --- /dev/null +++ b/.gitignore 
@@ -0,0 +1,33 @@ +HELP.md +target/ +!.mvn/wrapper/maven-wrapper.jar +!**/src/main/**/target/ +!**/src/test/**/target/ + +### STS ### +.apt_generated +.classpath +.factorypath +.project +.settings +.springBeans +.sts4-cache + +### IntelliJ IDEA ### +.idea +*.iws +*.iml +*.ipr + +### NetBeans ### +/nbproject/private/ +/nbbuild/ +/dist/ +/nbdist/ +/.nb-gradle/ +build/ +!**/src/main/**/build/ +!**/src/test/**/build/ + +### VS Code ### +.vscode/ diff --git a/.mvn/wrapper/maven-wrapper.properties b/.mvn/wrapper/maven-wrapper.properties new file mode 100644 index 0000000..d58dfb7 --- /dev/null +++ b/.mvn/wrapper/maven-wrapper.properties @@ -0,0 +1,19 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+wrapperVersion=3.3.2 +distributionType=only-script +distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.9.9/apache-maven-3.9.9-bin.zip diff --git a/UserselidrAppDataLocalTempA_study_of_the_safety_immunology_virolog-references.txt b/UserselidrAppDataLocalTempA_study_of_the_safety_immunology_virolog-references.txt new file mode 100644 index 0000000..e4e16e4 --- /dev/null +++ b/UserselidrAppDataLocalTempA_study_of_the_safety_immunology_virolog-references.txt @@ -0,0 +1 @@ +"hahahahahahaha" \ No newline at end of file diff --git "a/UserselidrAppDataLocalTemp\302\263661.263675-references.txt" "b/UserselidrAppDataLocalTemp\302\263661.263675-references.txt" new file mode 100644 index 0000000..83c2742 --- /dev/null +++ "b/UserselidrAppDataLocalTemp\302\263661.263675-references.txt" @@ -0,0 +1 @@ +"heeeeeeeeeeeeeeeeeeej" \ No newline at end of file diff --git a/mvnw b/mvnw new file mode 100644 index 0000000..19529dd --- /dev/null +++ b/mvnw @@ -0,0 +1,259 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Apache Maven Wrapper startup batch script, version 3.3.2 +# +# Optional ENV vars +# ----------------- +# JAVA_HOME - location of a JDK home dir, required when download maven via java source +# MVNW_REPOURL - repo url base for downloading maven distribution +# MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +# MVNW_VERBOSE - true: enable verbose log; debug: trace the mvnw script; others: silence the output +# ---------------------------------------------------------------------------- + +set -euf +[ "${MVNW_VERBOSE-}" != debug ] || set -x + +# OS specific support. +native_path() { printf %s\\n "$1"; } +case "$(uname)" in +CYGWIN* | MINGW*) + [ -z "${JAVA_HOME-}" ] || JAVA_HOME="$(cygpath --unix "$JAVA_HOME")" + native_path() { cygpath --path --windows "$1"; } + ;; +esac + +# set JAVACMD and JAVACCMD +set_java_home() { + # For Cygwin and MinGW, ensure paths are in Unix format before anything is touched + if [ -n "${JAVA_HOME-}" ]; then + if [ -x "$JAVA_HOME/jre/sh/java" ]; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACCMD="$JAVA_HOME/jre/sh/javac" + else + JAVACMD="$JAVA_HOME/bin/java" + JAVACCMD="$JAVA_HOME/bin/javac" + + if [ ! -x "$JAVACMD" ] || [ ! -x "$JAVACCMD" ]; then + echo "The JAVA_HOME environment variable is not defined correctly, so mvnw cannot run." >&2 + echo "JAVA_HOME is set to \"$JAVA_HOME\", but \"\$JAVA_HOME/bin/java\" or \"\$JAVA_HOME/bin/javac\" does not exist." >&2 + return 1 + fi + fi + else + JAVACMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v java + )" || : + JAVACCMD="$( + 'set' +e + 'unset' -f command 2>/dev/null + 'command' -v javac + )" || : + + if [ ! -x "${JAVACMD-}" ] || [ ! 
-x "${JAVACCMD-}" ]; then + echo "The java/javac command does not exist in PATH nor is JAVA_HOME set, so mvnw cannot run." >&2 + return 1 + fi + fi +} + +# hash string like Java String::hashCode +hash_string() { + str="${1:-}" h=0 + while [ -n "$str" ]; do + char="${str%"${str#?}"}" + h=$(((h * 31 + $(LC_CTYPE=C printf %d "'$char")) % 4294967296)) + str="${str#?}" + done + printf %x\\n $h +} + +verbose() { :; } +[ "${MVNW_VERBOSE-}" != true ] || verbose() { printf %s\\n "${1-}"; } + +die() { + printf %s\\n "$1" >&2 + exit 1 +} + +trim() { + # MWRAPPER-139: + # Trims trailing and leading whitespace, carriage returns, tabs, and linefeeds. + # Needed for removing poorly interpreted newline sequences when running in more + # exotic environments such as mingw bash on Windows. + printf "%s" "${1}" | tr -d '[:space:]' +} + +# parse distributionUrl and optional distributionSha256Sum, requires .mvn/wrapper/maven-wrapper.properties +while IFS="=" read -r key value; do + case "${key-}" in + distributionUrl) distributionUrl=$(trim "${value-}") ;; + distributionSha256Sum) distributionSha256Sum=$(trim "${value-}") ;; + esac +done <"${0%/*}/.mvn/wrapper/maven-wrapper.properties" +[ -n "${distributionUrl-}" ] || die "cannot read distributionUrl property in ${0%/*}/.mvn/wrapper/maven-wrapper.properties" + +case "${distributionUrl##*/}" in +maven-mvnd-*bin.*) + MVN_CMD=mvnd.sh _MVNW_REPO_PATTERN=/maven/mvnd/ + case "${PROCESSOR_ARCHITECTURE-}${PROCESSOR_ARCHITEW6432-}:$(uname -a)" in + *AMD64:CYGWIN* | *AMD64:MINGW*) distributionPlatform=windows-amd64 ;; + :Darwin*x86_64) distributionPlatform=darwin-amd64 ;; + :Darwin*arm64) distributionPlatform=darwin-aarch64 ;; + :Linux*x86_64*) distributionPlatform=linux-amd64 ;; + *) + echo "Cannot detect native platform for mvnd on $(uname)-$(uname -m), use pure java version" >&2 + distributionPlatform=linux-amd64 + ;; + esac + distributionUrl="${distributionUrl%-bin.*}-$distributionPlatform.zip" + ;; +maven-mvnd-*) MVN_CMD=mvnd.sh 
_MVNW_REPO_PATTERN=/maven/mvnd/ ;; +*) MVN_CMD="mvn${0##*/mvnw}" _MVNW_REPO_PATTERN=/org/apache/maven/ ;; +esac + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash> +[ -z "${MVNW_REPOURL-}" ] || distributionUrl="$MVNW_REPOURL$_MVNW_REPO_PATTERN${distributionUrl#*"$_MVNW_REPO_PATTERN"}" +distributionUrlName="${distributionUrl##*/}" +distributionUrlNameMain="${distributionUrlName%.*}" +distributionUrlNameMain="${distributionUrlNameMain%-bin}" +MAVEN_USER_HOME="${MAVEN_USER_HOME:-${HOME}/.m2}" +MAVEN_HOME="${MAVEN_USER_HOME}/wrapper/dists/${distributionUrlNameMain-}/$(hash_string "$distributionUrl")" + +exec_maven() { + unset MVNW_VERBOSE MVNW_USERNAME MVNW_PASSWORD MVNW_REPOURL || : + exec "$MAVEN_HOME/bin/$MVN_CMD" "$@" || die "cannot exec $MAVEN_HOME/bin/$MVN_CMD" +} + +if [ -d "$MAVEN_HOME" ]; then + verbose "found existing MAVEN_HOME at $MAVEN_HOME" + exec_maven "$@" +fi + +case "${distributionUrl-}" in +*?-bin.zip | *?maven-mvnd-?*-?*.zip) ;; +*) die "distributionUrl is not valid, must match *-bin.zip or maven-mvnd-*.zip, but found '${distributionUrl-}'" ;; +esac + +# prepare tmp dir +if TMP_DOWNLOAD_DIR="$(mktemp -d)" && [ -d "$TMP_DOWNLOAD_DIR" ]; then + clean() { rm -rf -- "$TMP_DOWNLOAD_DIR"; } + trap clean HUP INT TERM EXIT +else + die "cannot create temp dir" +fi + +mkdir -p -- "${MAVEN_HOME%/*}" + +# Download and Install Apache Maven +verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +verbose "Downloading from: $distributionUrl" +verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +# select .zip or .tar.gz +if ! 
command -v unzip >/dev/null; then + distributionUrl="${distributionUrl%.zip}.tar.gz" + distributionUrlName="${distributionUrl##*/}" +fi + +# verbose opt +__MVNW_QUIET_WGET=--quiet __MVNW_QUIET_CURL=--silent __MVNW_QUIET_UNZIP=-q __MVNW_QUIET_TAR='' +[ "${MVNW_VERBOSE-}" != true ] || __MVNW_QUIET_WGET='' __MVNW_QUIET_CURL='' __MVNW_QUIET_UNZIP='' __MVNW_QUIET_TAR=v + +# normalize http auth +case "${MVNW_PASSWORD:+has-password}" in +'') MVNW_USERNAME='' MVNW_PASSWORD='' ;; +has-password) [ -n "${MVNW_USERNAME-}" ] || MVNW_USERNAME='' MVNW_PASSWORD='' ;; +esac + +if [ -z "${MVNW_USERNAME-}" ] && command -v wget >/dev/null; then + verbose "Found wget ... using wget" + wget ${__MVNW_QUIET_WGET:+"$__MVNW_QUIET_WGET"} "$distributionUrl" -O "$TMP_DOWNLOAD_DIR/$distributionUrlName" || die "wget: Failed to fetch $distributionUrl" +elif [ -z "${MVNW_USERNAME-}" ] && command -v curl >/dev/null; then + verbose "Found curl ... using curl" + curl ${__MVNW_QUIET_CURL:+"$__MVNW_QUIET_CURL"} -f -L -o "$TMP_DOWNLOAD_DIR/$distributionUrlName" "$distributionUrl" || die "curl: Failed to fetch $distributionUrl" +elif set_java_home; then + verbose "Falling back to use Java to download" + javaSource="$TMP_DOWNLOAD_DIR/Downloader.java" + targetZip="$TMP_DOWNLOAD_DIR/$distributionUrlName" + cat >"$javaSource" <<-END + public class Downloader extends java.net.Authenticator + { + protected java.net.PasswordAuthentication getPasswordAuthentication() + { + return new java.net.PasswordAuthentication( System.getenv( "MVNW_USERNAME" ), System.getenv( "MVNW_PASSWORD" ).toCharArray() ); + } + public static void main( String[] args ) throws Exception + { + setDefault( new Downloader() ); + java.nio.file.Files.copy( java.net.URI.create( args[0] ).toURL().openStream(), java.nio.file.Paths.get( args[1] ).toAbsolutePath().normalize() ); + } + } + END + # For Cygwin/MinGW, switch paths to Windows format before running javac and java + verbose " - Compiling Downloader.java ..." 
+ "$(native_path "$JAVACCMD")" "$(native_path "$javaSource")" || die "Failed to compile Downloader.java" + verbose " - Running Downloader.java ..." + "$(native_path "$JAVACMD")" -cp "$(native_path "$TMP_DOWNLOAD_DIR")" Downloader "$distributionUrl" "$(native_path "$targetZip")" +fi + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +if [ -n "${distributionSha256Sum-}" ]; then + distributionSha256Result=false + if [ "$MVN_CMD" = mvnd.sh ]; then + echo "Checksum validation is not supported for maven-mvnd." >&2 + echo "Please disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + elif command -v sha256sum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | sha256sum -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + elif command -v shasum >/dev/null; then + if echo "$distributionSha256Sum $TMP_DOWNLOAD_DIR/$distributionUrlName" | shasum -a 256 -c >/dev/null 2>&1; then + distributionSha256Result=true + fi + else + echo "Checksum validation was requested but neither 'sha256sum' or 'shasum' are available." >&2 + echo "Please install either command, or disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." >&2 + exit 1 + fi + if [ $distributionSha256Result = false ]; then + echo "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised." >&2 + echo "If you updated your Maven version, you need to update the specified distributionSha256Sum property." 
>&2 + exit 1 + fi +fi + +# unzip and move +if command -v unzip >/dev/null; then + unzip ${__MVNW_QUIET_UNZIP:+"$__MVNW_QUIET_UNZIP"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -d "$TMP_DOWNLOAD_DIR" || die "failed to unzip" +else + tar xzf${__MVNW_QUIET_TAR:+"$__MVNW_QUIET_TAR"} "$TMP_DOWNLOAD_DIR/$distributionUrlName" -C "$TMP_DOWNLOAD_DIR" || die "failed to untar" +fi +printf %s\\n "$distributionUrl" >"$TMP_DOWNLOAD_DIR/$distributionUrlNameMain/mvnw.url" +mv -- "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" "$MAVEN_HOME" || [ -d "$MAVEN_HOME" ] || die "fail to move MAVEN_HOME" + +clean || : +exec_maven "$@" diff --git a/mvnw.cmd b/mvnw.cmd new file mode 100644 index 0000000..249bdf3 --- /dev/null +++ b/mvnw.cmd @@ -0,0 +1,149 @@ +<# : batch portion +@REM ---------------------------------------------------------------------------- +@REM Licensed to the Apache Software Foundation (ASF) under one +@REM or more contributor license agreements. See the NOTICE file +@REM distributed with this work for additional information +@REM regarding copyright ownership. The ASF licenses this file +@REM to you under the Apache License, Version 2.0 (the +@REM "License"); you may not use this file except in compliance +@REM with the License. You may obtain a copy of the License at +@REM +@REM http://www.apache.org/licenses/LICENSE-2.0 +@REM +@REM Unless required by applicable law or agreed to in writing, +@REM software distributed under the License is distributed on an +@REM "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +@REM KIND, either express or implied. See the License for the +@REM specific language governing permissions and limitations +@REM under the License. 
+@REM ---------------------------------------------------------------------------- + +@REM ---------------------------------------------------------------------------- +@REM Apache Maven Wrapper startup batch script, version 3.3.2 +@REM +@REM Optional ENV vars +@REM MVNW_REPOURL - repo url base for downloading maven distribution +@REM MVNW_USERNAME/MVNW_PASSWORD - user and password for downloading maven +@REM MVNW_VERBOSE - true: enable verbose log; others: silence the output +@REM ---------------------------------------------------------------------------- + +@IF "%__MVNW_ARG0_NAME__%"=="" (SET __MVNW_ARG0_NAME__=%~nx0) +@SET __MVNW_CMD__= +@SET __MVNW_ERROR__= +@SET __MVNW_PSMODULEP_SAVE=%PSModulePath% +@SET PSModulePath= +@FOR /F "usebackq tokens=1* delims==" %%A IN (`powershell -noprofile "& {$scriptDir='%~dp0'; $script='%__MVNW_ARG0_NAME__%'; icm -ScriptBlock ([Scriptblock]::Create((Get-Content -Raw '%~f0'))) -NoNewScope}"`) DO @( + IF "%%A"=="MVN_CMD" (set __MVNW_CMD__=%%B) ELSE IF "%%B"=="" (echo %%A) ELSE (echo %%A=%%B) +) +@SET PSModulePath=%__MVNW_PSMODULEP_SAVE% +@SET __MVNW_PSMODULEP_SAVE= +@SET __MVNW_ARG0_NAME__= +@SET MVNW_USERNAME= +@SET MVNW_PASSWORD= +@IF NOT "%__MVNW_CMD__%"=="" (%__MVNW_CMD__% %*) +@echo Cannot start maven from wrapper >&2 && exit /b 1 +@GOTO :EOF +: end batch / begin powershell #> + +$ErrorActionPreference = "Stop" +if ($env:MVNW_VERBOSE -eq "true") { + $VerbosePreference = "Continue" +} + +# calculate distributionUrl, requires .mvn/wrapper/maven-wrapper.properties +$distributionUrl = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionUrl +if (!$distributionUrl) { + Write-Error "cannot read distributionUrl property in $scriptDir/.mvn/wrapper/maven-wrapper.properties" +} + +switch -wildcard -casesensitive ( $($distributionUrl -replace '^.*/','') ) { + "maven-mvnd-*" { + $USE_MVND = $true + $distributionUrl = $distributionUrl -replace '-bin\.[^.]*$',"-windows-amd64.zip" + 
$MVN_CMD = "mvnd.cmd" + break + } + default { + $USE_MVND = $false + $MVN_CMD = $script -replace '^mvnw','mvn' + break + } +} + +# apply MVNW_REPOURL and calculate MAVEN_HOME +# maven home pattern: ~/.m2/wrapper/dists/{apache-maven-<version>,maven-mvnd-<version>-<platform>}/<hash> +if ($env:MVNW_REPOURL) { + $MVNW_REPO_PATTERN = if ($USE_MVND) { "/org/apache/maven/" } else { "/maven/mvnd/" } + $distributionUrl = "$env:MVNW_REPOURL$MVNW_REPO_PATTERN$($distributionUrl -replace '^.*'+$MVNW_REPO_PATTERN,'')" +} +$distributionUrlName = $distributionUrl -replace '^.*/','' +$distributionUrlNameMain = $distributionUrlName -replace '\.[^.]*$','' -replace '-bin$','' +$MAVEN_HOME_PARENT = "$HOME/.m2/wrapper/dists/$distributionUrlNameMain" +if ($env:MAVEN_USER_HOME) { + $MAVEN_HOME_PARENT = "$env:MAVEN_USER_HOME/wrapper/dists/$distributionUrlNameMain" +} +$MAVEN_HOME_NAME = ([System.Security.Cryptography.MD5]::Create().ComputeHash([byte[]][char[]]$distributionUrl) | ForEach-Object {$_.ToString("x2")}) -join '' +$MAVEN_HOME = "$MAVEN_HOME_PARENT/$MAVEN_HOME_NAME" + +if (Test-Path -Path "$MAVEN_HOME" -PathType Container) { + Write-Verbose "found existing MAVEN_HOME at $MAVEN_HOME" + Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD" + exit $? +} + +if (! 
$distributionUrlNameMain -or ($distributionUrlName -eq $distributionUrlNameMain)) { + Write-Error "distributionUrl is not valid, must end with *-bin.zip, but found $distributionUrl" +} + +# prepare tmp dir +$TMP_DOWNLOAD_DIR_HOLDER = New-TemporaryFile +$TMP_DOWNLOAD_DIR = New-Item -Itemtype Directory -Path "$TMP_DOWNLOAD_DIR_HOLDER.dir" +$TMP_DOWNLOAD_DIR_HOLDER.Delete() | Out-Null +trap { + if ($TMP_DOWNLOAD_DIR.Exists) { + try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null } + catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" } + } +} + +New-Item -Itemtype Directory -Path "$MAVEN_HOME_PARENT" -Force | Out-Null + +# Download and Install Apache Maven +Write-Verbose "Couldn't find MAVEN_HOME, downloading and installing it ..." +Write-Verbose "Downloading from: $distributionUrl" +Write-Verbose "Downloading to: $TMP_DOWNLOAD_DIR/$distributionUrlName" + +$webclient = New-Object System.Net.WebClient +if ($env:MVNW_USERNAME -and $env:MVNW_PASSWORD) { + $webclient.Credentials = New-Object System.Net.NetworkCredential($env:MVNW_USERNAME, $env:MVNW_PASSWORD) +} +[Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12 +$webclient.DownloadFile($distributionUrl, "$TMP_DOWNLOAD_DIR/$distributionUrlName") | Out-Null + +# If specified, validate the SHA-256 sum of the Maven distribution zip file +$distributionSha256Sum = (Get-Content -Raw "$scriptDir/.mvn/wrapper/maven-wrapper.properties" | ConvertFrom-StringData).distributionSha256Sum +if ($distributionSha256Sum) { + if ($USE_MVND) { + Write-Error "Checksum validation is not supported for maven-mvnd. `nPlease disable validation by removing 'distributionSha256Sum' from your maven-wrapper.properties." 
+ } + Import-Module $PSHOME\Modules\Microsoft.PowerShell.Utility -Function Get-FileHash + if ((Get-FileHash "$TMP_DOWNLOAD_DIR/$distributionUrlName" -Algorithm SHA256).Hash.ToLower() -ne $distributionSha256Sum) { + Write-Error "Error: Failed to validate Maven distribution SHA-256, your Maven distribution might be compromised. If you updated your Maven version, you need to update the specified distributionSha256Sum property." + } +} + +# unzip and move +Expand-Archive "$TMP_DOWNLOAD_DIR/$distributionUrlName" -DestinationPath "$TMP_DOWNLOAD_DIR" | Out-Null +Rename-Item -Path "$TMP_DOWNLOAD_DIR/$distributionUrlNameMain" -NewName $MAVEN_HOME_NAME | Out-Null +try { + Move-Item -Path "$TMP_DOWNLOAD_DIR/$MAVEN_HOME_NAME" -Destination $MAVEN_HOME_PARENT | Out-Null +} catch { + if (! (Test-Path -Path "$MAVEN_HOME" -PathType Container)) { + Write-Error "fail to move MAVEN_HOME" + } +} finally { + try { Remove-Item $TMP_DOWNLOAD_DIR -Recurse -Force | Out-Null } + catch { Write-Warning "Cannot remove $TMP_DOWNLOAD_DIR" } +} + +Write-Output "MVN_CMD=$MAVEN_HOME/bin/$MVN_CMD" diff --git a/pdf_database.db b/pdf_database.db new file mode 100644 index 0000000..e69de29 diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..b83ae51 --- /dev/null +++ b/pom.xml @@ -0,0 +1,175 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 https://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-parent</artifactId> + <version>3.4.0</version> + <relativePath/> <!-- lookup parent from repository --> + </parent> + <groupId>com.dre0059</groupId> + <artifactId>articleProcessor</artifactId> + <version>0.0.1-SNAPSHOT</version> + <name>articleProcessor</name> + <description>articleProcessor</description> + <url/> + <licenses> + 
<license/> + </licenses> + <developers> + <developer/> + </developers> + <scm> + <connection/> + <developerConnection/> + <tag/> + <url/> + </scm> + <properties> + <java.version>17</java.version> + </properties> + + <!-- --> + <repositories> + <repository> + <id>grobid</id> + <name>GROBID DIY repo</name> + <url>https://grobid.s3.eu-west-1.amazonaws.com/repo/</url> + </repository> + </repositories> + + <dependencies> + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-data-jpa</artifactId> + </dependency> + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-web</artifactId> + </dependency> + + <!--<dependency> + <groupId>com.mysql</groupId> + <artifactId>mysql-connector-j</artifactId> + <scope>runtime</scope> + </dependency> + + <dependency> + <groupId>org.xerial</groupId> + <artifactId>sqlite-jdbc</artifactId> + <version>3.43.2.1</version> + </dependency> + + <dependency> + <groupId>org.hibernate.orm</groupId> + <artifactId>hibernate-community-dialects</artifactId> + <version>6.2.12.Final</version> + </dependency> + --> + + <dependency> + <groupId>com.h2database</groupId> + <artifactId>h2</artifactId> + <scope>runtime</scope> + </dependency> + + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-webflux</artifactId> + </dependency> + + + + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-test</artifactId> + <scope>test</scope> + </dependency> + + <!-- APACHE TIKA --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>2.9.2</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>2.9.2</version> + <type>pom</type> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-pdf-module</artifactId> + <version>2.9.2</version> + </dependency> + + 
 <!-- PDF BOX --> + <dependency> + <groupId>org.apache.pdfbox</groupId> + <artifactId>pdfbox</artifactId> + <version>2.0.27</version> <!-- Môžete skontrolovať najnovšiu verziu --> + </dependency> + + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <version>2.23.1</version> <!-- zadejte konkrétní verzi, např. 2.19.0 --> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <version>2.23.1</version> + </dependency> + + <!-- OCR TESSERACT --> + <dependency> + <groupId>net.sourceforge.tess4j</groupId> + <artifactId>tess4j</artifactId> + <version>4.3.0</version> + </dependency> + <!-- OCR - Optické rozpoznávanie znakov --> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers-standard-package</artifactId> + <version>2.4.1</version> + </dependency> + + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-ocr-module</artifactId> + <version>2.9.2</version> + </dependency> + + <dependency> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-starter-thymeleaf</artifactId> + </dependency> + + <!-- GROBID Dependencies - stará verzia + <dependency> + <groupId>org.grobid</groupId> + <artifactId>grobid-core</artifactId> + <version>0.8.1</version> + </dependency> + --> + + <!-- Apache Commons IO (pre prácu so súbormi) --> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>2.11.0</version> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.springframework.boot</groupId> + <artifactId>spring-boot-maven-plugin</artifactId> + </plugin> + </plugins> + </build> + +</project> diff --git a/src/main/java/com/dre0059/articleprocessor/ArticleProcessorApplication.java b/src/main/java/com/dre0059/articleprocessor/ArticleProcessorApplication.java new file mode 100644 index 0000000..53a5f87 --- /dev/null +++ 
b/src/main/java/com/dre0059/articleprocessor/ArticleProcessorApplication.java
@@ -0,0 +1,13 @@
+package com.dre0059.articleprocessor;
+
+import org.springframework.boot.SpringApplication;
+import org.springframework.boot.autoconfigure.SpringBootApplication;
+
+@SpringBootApplication
+public class ArticleProcessorApplication {
+
+    public static void main(String[] args) {
+        SpringApplication.run(ArticleProcessorApplication.class, args);
+    }
+
+}
diff --git a/src/main/java/com/dre0059/articleprocessor/GrobidClient.java b/src/main/java/com/dre0059/articleprocessor/GrobidClient.java
new file mode 100644
index 0000000..91ff786
--- /dev/null
+++ b/src/main/java/com/dre0059/articleprocessor/GrobidClient.java
@@ -0,0 +1,59 @@
+package com.dre0059.articleprocessor;
+
+import org.springframework.http.MediaType;
+import org.springframework.stereotype.Service;
+import org.springframework.web.reactive.function.BodyInserters;
+import org.springframework.web.reactive.function.client.WebClient;
+import org.springframework.core.io.FileSystemResource;
+import reactor.core.publisher.Mono;
+
+import java.io.File;
+
+/**
+ * Thin HTTP client for a GROBID server: uploads a PDF as multipart form data
+ * and returns the raw response body.
+ */
+@Service
+public class GrobidClient {
+    private final WebClient webClient;
+
+    public GrobidClient() {
+        this.webClient = WebClient.builder()
+                .baseUrl("http://158.196.98.65:8080")  // URL of the running GROBID server
+                .build();
+    }
+
+    /**
+     * Extracts the header metadata of a PDF via {@code /api/processHeaderDocument}.
+     *
+     * @param pdfFile PDF to upload as the {@code input} form field
+     * @return a Mono emitting the response body as a single String
+     */
+    public Mono<String> processHeader(File pdfFile){
+        return webClient.post()
+                .uri("/api/processHeaderDocument")
+                .contentType(MediaType.MULTIPART_FORM_DATA)
+                // consolidateHeader is a GROBID form parameter, so it must travel in the
+                // multipart body; WebClient#attribute only sets a client-side request attribute.
+                .body(BodyInserters.fromMultipartData("input", new FileSystemResource(pdfFile))
+                        .with("consolidateHeader", "1"))
+                .retrieve()
+                .bodyToMono(String.class);
+    }
+
+    /**
+     * Extracts the bibliographic references of a PDF via {@code /api/processReferences}.
+     *
+     * @param pdfFile PDF to upload as the {@code input} form field
+     * @return a Mono emitting the response body as a single String
+     */
+    public Mono<String> processReferences(File pdfFile){
+        return webClient.post()
+                .uri("/api/processReferences")
+                .contentType(MediaType.MULTIPART_FORM_DATA)
+                .body(BodyInserters.fromMultipartData("input", new FileSystemResource(pdfFile)))
+                .retrieve()
+                .bodyToMono(String.class);
+    }
+
+}
\ No newline at end of file
diff --git a/src/main/java/com/dre0059/articleprocessor/apacheTika/PDFbox.java b/src/main/java/com/dre0059/articleprocessor/apacheTika/PDFbox.java
new file mode 100644
index 0000000..f520ae5
--- /dev/null
+++ b/src/main/java/com/dre0059/articleprocessor/apacheTika/PDFbox.java
@@ -0,0 +1,46 @@
+package com.dre0059.articleprocessor.apacheTika;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.springframework.stereotype.Component;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+
+/**
+ * Extracts the plain text of a PDF with PDFBox and writes it to a text file.
+ */
+@Component
+public class PDFbox {
+    // Number of text files written successfully by this instance.
+    private int countTXT = 0;
+    public int getCountTXT(){ return this.countTXT; }
+
+    /**
+     * Writes the text content of the PDF at {@code inputPDFpath} to {@code outputTXTpath}.
+     * I/O failures are reported on stderr and are not propagated to the caller.
+     *
+     * @param inputPDFpath path of the PDF to read
+     * @param outputTXTpath path of the text file to create or overwrite
+     */
+    public void toTXT(String inputPDFpath, String outputTXTpath){
+        File pdfFile = new File(inputPDFpath);
+
+        try(PDDocument document = PDDocument.load(pdfFile)){
+            PDFTextStripper pdfStripper = new PDFTextStripper();
+            String extractedText = pdfStripper.getText(document);
+
+            try (FileWriter writer = new FileWriter(outputTXTpath)) {
+                writer.write(extractedText);
+                System.out.println("File was sucessfully saved to : " + outputTXTpath);
+                countTXT++;
+            } catch (IOException e) {
+                System.err.println("FAILURE - file was not saved : " + e.getMessage());
+            }
+
+        } catch (IOException ex) {
+            System.err.println("FAILURE - Problem kin reading file : " + ex.getMessage());
+        }
+    }
+}
diff --git a/src/main/java/com/dre0059/articleprocessor/apacheTika/ReferencesScanner.java b/src/main/java/com/dre0059/articleprocessor/apacheTika/ReferencesScanner.java
new file mode 100644
index 0000000..279e405
--- /dev/null
+++ b/src/main/java/com/dre0059/articleprocessor/apacheTika/ReferencesScanner.java
@@ -0,0 +1,177 @@
+package
com.dre0059.articleprocessor.apacheTika; + +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; + +import javax.sound.midi.Soundbank; +import java.io.*; +import java.util.ArrayList; +import java.util.Scanner; +import java.util.Vector; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class ReferencesScanner { + + // private Vector<String> references; + // notFound is just for TESTING + private int notFound; + private int notParsed; + public int getNotFound(){ + return this.notFound; + } + public int getNotParsed(){ return this.notParsed; } + + + public boolean findReferences(String filepath, String outputPath) { + Tika tika = new Tika(); + String content; + + try { + content = tika.parseToString(new File(filepath)); + String[] lines = content.split("\\r?\\n"); // split text to lines + + boolean found = false; + StringBuilder referencesData = new StringBuilder(); + + // Regular expression for "References" and variations + + String regex = "\\b[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]?\\b|" + + "\\b[Nn][Oo][Tt][Ee][Ss]?\\s+[Aa][Nn][Dd]\\s+[Rr][Ee][Ff][Ee][Rr][Ee][Nn][Cc][Ee][Ss]?\\b"; + + // the most suitable for finding the references + //String regex = "(?i)(\\d*\\s*References|References|R\\s*E\\s*F\\s*E\\s*R\\s*E\\s*N\\s*C\\s*E\\s*S|Notes\\s+and\\s+References|RE[FE]*R[EE]*N[C]*E[S]*|^\\s*REFERENCES\\s*$)"; + Pattern pattern = Pattern.compile(regex); + + for (String line : lines) { + if (found) { + // TODO : FURTHER READING is in the end of the line and in the beggining of the other line (this below is not working) + if (line.contains("APPENDIX") || line.contains("Appendix") || line.contains("FURTHER\nREADING")) + break; + // Append lines of references + referencesData.append(line).append(System.lineSeparator()); + } else { + // Check if line contains references + Matcher matcher = pattern.matcher(line); + if (matcher.find()) { + found = true; // Reference found + 
referencesData.append(line).append(System.lineSeparator()); + } + } + } + + // Save references to TXT file + if (found) { + try (FileWriter writer = new FileWriter(outputPath)) { + writer.write(referencesData.toString()); + System.out.println("References found and saved to " + outputPath); + } catch (IOException e) { + e.printStackTrace(); + } + return true; // Return true indicating references were found + } else { + System.out.println("References NOT found in file: " + filepath); + notFound++; + return false; // Return false indicating no references were found + } + + } catch (IOException | TikaException e) { + e.printStackTrace(); + return false; // Return false if an exception occurs + } + } + // parse references to vector + public Vector<String> parseReferences(String inputReferencesPath, String outputPath) throws IOException { + Vector<String> parsedReferences = new Vector<>(); + + FileInputStream stream = null; + try{ + stream = new FileInputStream(inputReferencesPath); + } catch (FileNotFoundException e) { + throw new RuntimeException("File not found " + inputReferencesPath, e); + } + + Scanner scanner = new Scanner(stream); + String line = null; + + int index = 1; // number of current reference + boolean found = false; // used for lines behind found reference + + while (scanner.hasNextLine()){ + // first reference (from second until the last, it will be true - because we dont wanna go to another line, we wanna stay on line with next [ i ] ) + if(!found) + line = scanner.nextLine(); + + // regex for patterns : 1 1. (1) [1] ... s* - spaces + //String regex = "(\\(\\s*\\b" + index + "\\b\\s*\\))|(\\[\\s*\\b " + index +"\\b\\s*\\])|(\\b" + index + "\\b\\s*\\.)|(\\b" + index + "\\b)"; + + String regex = "(\\(\\s*" + index + "\\s*\\))" // Formát (index) + + "|(\\[\\s*" + index + "\\s*\\])" // Formát [index] + + "|(\\b" + index + "\\b\\s*\\.)" // Formát index. 
+ + "|(\\b" + index + "\\b)"; // SamotnĂ© ÄŤĂslo index + + + Pattern pattern = Pattern.compile(regex); // regular expression + Matcher matcher = pattern.matcher(line); // matcher for comparing regular exrpession + + // [ i ] found, add reference + if(matcher.find()){ + StringBuilder currReference = new StringBuilder(); + currReference.append(line); // append line which contains [ 1 ] + + index++; // regex searches for 2 instead of 1 + //regex = "(\\(\\s*\\b" + index + "\\b\\s*\\))|(\\[\\s*\\b " + index +"\\b\\s*\\])|(\\b" + index + "\\b\\s*\\.)|(\\b" + index + "\\b\\s+)"; // [ 2 ] + regex = "(\\(\\s*" + index + "\\s*\\))" // Formát (index) + + "|(\\[\\s*" + index + "\\s*\\])" // Formát [index] + + "|(\\b" + index + "\\b\\s*\\.)" // Formát index. + + "|(\\b" + index + "\\b\\s+)"; // SamotnĂ© ÄŤĂslo index + + pattern = Pattern.compile(regex); + + //line = scanner.nextLine(); + //matcher = pattern.matcher(line); // looking for [ 2 ] on the next line + + while(scanner.hasNextLine()){ // all lines without regex (these lines belong to first reference) + line = scanner.nextLine(); + matcher = pattern.matcher(line); // looking for [ 2 ] on the next line + + if(matcher.find()){ // [ 2 ] was found + found = true; + break; + } + currReference.append(" ").append(line); // [ 2 ] was not found, lines belong to the first reference + } + //System.out.println("Match was found\n"); + + parsedReferences.add(currReference.toString()); // add the whole reference [ 1 ] to vector + } + } + + scanner.close(); + + int i = 0; + try(FileWriter writer = new FileWriter(outputPath)) { + for (String ref : parsedReferences) { + i++; + writer.write(i + ". 
" + ref + "\n"); + //System.out.println(i + " " + ref); + } + }catch (IOException e){ + e.printStackTrace(); + } + + if(!(parsedReferences.size() > 0)){ + notParsed++; + System.out.println("References NOT parsed in file: " + inputReferencesPath); + } + return parsedReferences; + } + + // pslit reference and get NAME and year out of it + public void splitReferences(Vector<String> oneDocumentReferences){ + for(String ref : oneDocumentReferences){ + System.out.println(ref + "\n"); + } + } +} diff --git a/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java b/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java new file mode 100644 index 0000000..91bfab5 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java @@ -0,0 +1,105 @@ +package com.dre0059.articleprocessor.controller; + +import com.dre0059.articleprocessor.GrobidClient; +import com.dre0059.articleprocessor.model.DocumentMetadata; +import com.dre0059.articleprocessor.service.MetadataParser; +import com.dre0059.articleprocessor.repository.DocumentRepository; +import com.dre0059.articleprocessor.repository.ReferenceRepository; +import com.dre0059.articleprocessor.service.TEIparser; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.stereotype.Controller; +import org.springframework.ui.Model; +import org.springframework.web.bind.annotation.*; +import org.springframework.web.multipart.MultipartFile; +import reactor.core.publisher.Mono; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +// TODO : +// 1. âś… nefunguje mi správne uloĹľenie ÄŤlánku, pokiaÄľ uĹľ ÄŤlánok v DBS je, aktuálne mi vyhodĂ len ERROR Ĺľe nemoĹľno správne spracovaĹĄ +// 2. âś… !!! uloĹľenie referenciĂ do databázy +// 3. 
prepojĂm referenciu s uloĹľenĂ˝mi ÄŤlánkami ??? +// 4. viac spraviĹĄ program USER-FRIENDLY - vĂ˝pis Ĺľe spracovávam document, vĂ˝pis Ĺľe dokument uĹľ je uloĹľenĂ˝, vĂ˝pis Ĺľe dokument sa uloĹľil a vypĂšem metadata pre overenie +// 5. nesprávne vyĹĄahovanie referenciĂ - referencie ktorĂ© sa odkazujĂş na nejakĂ˝ web, nie sĂş spracovanĂ© + +@Controller +@RequestMapping("/api/grobid") +public class FileUploadController { + private static final Logger logger = LoggerFactory.getLogger(FileUploadController.class); + private final GrobidClient grobidClient; + private final DocumentRepository metadataRepository; + private final ReferenceRepository referenceRepository; + + public FileUploadController(GrobidClient grobidClient, DocumentRepository metadataRepository, ReferenceRepository referenceRepository) { + this.grobidClient = grobidClient; + this.metadataRepository = metadataRepository; + this.referenceRepository = referenceRepository; + } + + @GetMapping("/upload") + public String showUploadForm(Model model) { + return "upload"; // vracia upload.html + } + + @PostMapping("/upload") + @ResponseBody + public Mono<ResponseEntity<Map<String, String>>> handleFileUpload(@RequestParam("file") MultipartFile file) { + logger.info("Received file: {}", file.getOriginalFilename()); + + return Mono.fromCallable(() -> { + Path tempFile = Files.createTempFile("upload-", ".pdf"); + file.transferTo(tempFile.toFile()); + return tempFile.toFile(); + }).flatMap(pdfFile -> { + Mono<String> metadataMono = grobidClient.processHeader(pdfFile); + Mono<String> referencesMono = grobidClient.processReferences(pdfFile); + + return Mono.zip(metadataMono, referencesMono) + .flatMap(result -> { + String metadataJson = result.getT1(); + String referencesXml = result.getT2(); + + String title = MetadataParser.extractTitle(metadataJson); + List<String> authors = MetadataParser.extractAuthors(metadataJson); + + return Mono.justOrEmpty(metadataRepository.findByTitle(title)) + .map(existing -> { + 
logger.warn("Article with title '{}' already exists!", title); + return ResponseEntity.status(HttpStatus.CONFLICT) + .body(Map.of("error", "Article is already in database.")); + }) + .switchIfEmpty(Mono.fromCallable(() -> { + DocumentMetadata doc = new DocumentMetadata(title, authors); + metadataRepository.save(doc); + + // Spracovanie referenciĂ cez TEIparser + TEIparser teiParser = new TEIparser(referenceRepository); + teiParser.parseAndSaveToDB(referencesXml, doc); + + Map<String, String> response = new HashMap<>(); + response.put("metadata", metadataJson); + response.put("references", referencesXml); + + return ResponseEntity.ok(response); + })); + }) + .onErrorResume(e -> { + logger.error("Error processing PDF", e); + return Mono.just(ResponseEntity.internalServerError().body(Map.of("error", "Failed to process PDF"))); + }); + }); + } + + +} + + + diff --git a/src/main/java/com/dre0059/articleprocessor/model/Author.java b/src/main/java/com/dre0059/articleprocessor/model/Author.java new file mode 100644 index 0000000..69329b9 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/model/Author.java @@ -0,0 +1,36 @@ +package com.dre0059.articleprocessor.model; + +import jakarta.persistence.*; + +import java.util.ArrayList; +import java.util.List; + +@Entity +@Table(name = "authors") +public class Author { + + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + private String name; + private String surname; + + @ManyToMany(mappedBy = "authors") + private List<Document> documents = new ArrayList<Document>(); + + public Author(){} + public Author(String name, String surname) { + this.name = name; + this.surname = surname; + } + + public Long getId() { return id; } + public String getName() { return name; } + public String getSurname() { return surname; } + public List<Document> getDocuments() { return documents; } + + public void setName(String name) { this.name = name; } + public void setSurname(String surname) { this.surname = 
surname; } + public void setDocuments(List<Document> documents) { this.documents = documents; } +} diff --git a/src/main/java/com/dre0059/articleprocessor/model/Document.java b/src/main/java/com/dre0059/articleprocessor/model/Document.java new file mode 100644 index 0000000..a017029 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/model/Document.java @@ -0,0 +1,62 @@ +package com.dre0059.articleprocessor.model; + +import jakarta.persistence.*; +import org.hibernate.annotations.CollectionId; +import com.dre0059.articleprocessor.model.*; + +import java.util.ArrayList; +import java.util.List; + +@Entity +@Table(name = "documents") +public class Document { + + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + private String title; + private Integer year; + private String doi; + + @Column(name = "abstractText") + private String abstractText; + + private Integer pages; + private String publisher; + + @OneToMany(mappedBy = "fromDocument", cascade = CascadeType.ALL) + private List<Reference> references = new ArrayList<>(); + + @ManyToMany + @JoinTable( + name = "document_author", + joinColumns = @JoinColumn(name = "ID_document"), + inverseJoinColumns = @JoinColumn(name = "ID_author") + ) + private List<Author> authors = new ArrayList<>(); + + public Document() {} + + public Document(String title, Integer year, String doi, String abstractText, Integer pages, String publisher) { + this.title = title; + this.year = year; + this.doi = doi; + this.abstractText = abstractText; + this.pages = pages; + this.publisher = publisher; + } + + public Long getId() { return id; } + public String getTitle() { return title; } + public Integer getYear() { return year; } + public String getDoi() { return doi; } + public String getAbstractText() { return abstractText; } + public Integer getPages() { return pages; } + public String getPublisher() { return publisher; } + public List<Reference> getReferences() { return references; } + public 
List<Author> getAuthors() { return authors; } + + public void setAuthors(List<Author> authors) { this.authors = authors; } + +} diff --git a/src/main/java/com/dre0059/articleprocessor/model/DocumentMetadata.java b/src/main/java/com/dre0059/articleprocessor/model/DocumentMetadata.java new file mode 100644 index 0000000..1aa86f9 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/model/DocumentMetadata.java @@ -0,0 +1,37 @@ +package com.dre0059.articleprocessor.model; + +import jakarta.persistence.*; + +import java.util.ArrayList; +import java.util.List; + +@Entity // DBS table +@Table (name = "DOCUMENT_METADATA", uniqueConstraints = @UniqueConstraint(columnNames = "title")) +public class DocumentMetadata { + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) // ID is generated automatically + private Long id; + + private String title; + + @ElementCollection // pomocna tabulka authors + private List<String> authors = new ArrayList<>(); + + // needed for Hibernate for right instances in DBS + public DocumentMetadata() {} + + public DocumentMetadata(String title, List<String> authors) { + this.title = title; + this.authors = authors; + } + + public Long getId(){ + return id; + } + public String getTitle(){ + return title; + } + public List<String> getAuthors(){ + return authors; + } +} diff --git a/src/main/java/com/dre0059/articleprocessor/model/Reference.java b/src/main/java/com/dre0059/articleprocessor/model/Reference.java new file mode 100644 index 0000000..5aae7ce --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/model/Reference.java @@ -0,0 +1,41 @@ +package com.dre0059.articleprocessor.model; + +import jakarta.persistence.*; + +import javax.print.Doc; +import java.util.List; + +@Entity +@Table(name = "references") +public class Reference { + @Id + @GeneratedValue(strategy = GenerationType.IDENTITY) + private Long id; + + // number or letters in reference list + private String orderNumber; + + @ManyToOne(cascade = CascadeType.ALL) + 
@JoinColumn(name = "ID_fromDocument") + private Document fromDocument; + + @ManyToOne(cascade = CascadeType.ALL) + @JoinColumn(name = "ID_toDocument") + private Document toDocument; + + public Reference() {} + public Reference(String orderNumber, Document fromDocument, Document toDocument) { + this.orderNumber = orderNumber; + this.fromDocument = fromDocument; + this.toDocument = toDocument; + } + + public String getOrderNumber() { return orderNumber; } + public Document getFromDocument() { return fromDocument; } + public Document getToDocument() { return toDocument; } + public Long getId() { return id; } + + public void setFromDocument(Document fromDocument) { this.fromDocument = fromDocument; } + public void setToDocument(Document toDocument) { this.toDocument = toDocument; } + public void setOrderNumber(String orderNumber) { this.orderNumber = orderNumber; } +} diff --git a/src/main/java/com/dre0059/articleprocessor/repository/AuthorRepository.java b/src/main/java/com/dre0059/articleprocessor/repository/AuthorRepository.java new file mode 100644 index 0000000..846a3fa --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/repository/AuthorRepository.java @@ -0,0 +1,12 @@ +package com.dre0059.articleprocessor.repository; + +import com.dre0059.articleprocessor.model.Author; +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; + +import java.util.Optional; + +@Repository +public interface AuthorRepository extends JpaRepository<Author, Long> { + Optional<Author> findByFullname(String name, String surname); +} diff --git a/src/main/java/com/dre0059/articleprocessor/repository/DocumentRepository.java b/src/main/java/com/dre0059/articleprocessor/repository/DocumentRepository.java new file mode 100644 index 0000000..d68a201 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/repository/DocumentRepository.java @@ -0,0 +1,15 @@ +package com.dre0059.articleprocessor.repository; + +import 
com.dre0059.articleprocessor.model.DocumentMetadata; + +import org.springframework.data.jpa.repository.JpaRepository; +import org.springframework.stereotype.Repository; +import org.w3c.dom.Document; + +import java.util.Optional; + +// uklada extrahovane data +@Repository +public interface DocumentRepository extends JpaRepository<Document, Long> { + Optional<Document> findByTitleAndAuthorsSurname(String title, String surname); +} diff --git a/src/main/java/com/dre0059/articleprocessor/repository/ReferenceRepository.java b/src/main/java/com/dre0059/articleprocessor/repository/ReferenceRepository.java new file mode 100644 index 0000000..f5cee29 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/repository/ReferenceRepository.java @@ -0,0 +1,9 @@ +package com.dre0059.articleprocessor.repository; + +import com.dre0059.articleprocessor.model.Reference; +import org.springframework.data.jpa.repository.JpaRepository; + +import java.util.List; + +public interface ReferenceRepository extends JpaRepository<Reference, Long> { +} diff --git a/src/main/java/com/dre0059/articleprocessor/service/DocumentService.java b/src/main/java/com/dre0059/articleprocessor/service/DocumentService.java new file mode 100644 index 0000000..9b3c6ca --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/service/DocumentService.java @@ -0,0 +1,4 @@ +package com.dre0059.articleprocessor.service; + +public class DocumentService { +} diff --git a/src/main/java/com/dre0059/articleprocessor/service/MetadataParser.java b/src/main/java/com/dre0059/articleprocessor/service/MetadataParser.java new file mode 100644 index 0000000..2cfc04b --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/service/MetadataParser.java @@ -0,0 +1,70 @@ +package com.dre0059.articleprocessor.service; + +import com.dre0059.articleprocessor.model.Author; +import com.dre0059.articleprocessor.model.Document; +import com.dre0059.articleprocessor.repository.AuthorRepository; +import 
com.dre0059.articleprocessor.repository.DocumentRepository; +import com.dre0059.articleprocessor.repository.ReferenceRepository; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.stereotype.Service; + +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +@Service +public class MetadataParser { + + @Autowired + private DocumentRepository documentRepository; + @Autowired + private AuthorRepository authorRepository; + + @Autowired + public MetadataParser(DocumentRepository documentRepository, AuthorRepository authorRepository) { + this.documentRepository = documentRepository; + this.authorRepository = authorRepository; + } + + public Document parseBibTeX(String bibtexString) { + // Regular expression pre zĂskanie hodnĂ´t z BibTeX formátu + Pattern pattern = Pattern.compile("@.*?\\{.*?,\\s*author\\s*=\\s*\\{(.*?)\\},\\s*title\\s*=\\s*\\{(.*?)\\},\\s*doi\\s*=\\s*\\{(.*?)\\},\\s*abstract\\s*=\\s*\\{(.*?)\\}"); + Matcher matcher = pattern.matcher(bibtexString); + + if (matcher.find()) { + String authorsRaw = matcher.group(1); + String title = matcher.group(2); + String doi = matcher.group(3); + String abstractText = matcher.group(4); + + List<Author> authors = parseAuthors(authorsRaw); + + Document document = new Document(title, null, doi, abstractText, null, null); + document.setAuthors(authors); + + documentRepository.save(document); + + for (Author author : authors) { + authorRepository.save(author); + } + + return document; + } + return null; + } + + private List<Author> parseAuthors(String authorsRaw) { + List<Author> authors = new ArrayList<>(); + String[] authorNames = authorsRaw.split(" and "); + for (String fullName : authorNames) { + String[] nameParts = fullName.trim().split("\\s+", 2); + if (nameParts.length == 2) { + authors.add(new Author(nameParts[1], nameParts[0])); // Priezvisko, Meno + } else { + authors.add(new Author(nameParts[0], "")); // Ak 
meno nemá priezvisko + } + } + return authors; + } +} diff --git a/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java b/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java new file mode 100644 index 0000000..4428ebb --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java @@ -0,0 +1,4 @@ +package com.dre0059.articleprocessor.service; + +public class ReferenceService { +} diff --git a/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java b/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java new file mode 100644 index 0000000..b39d4bd --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java @@ -0,0 +1,95 @@ +package com.dre0059.articleprocessor.service; + +import com.dre0059.articleprocessor.model.DocumentMetadata; +import com.dre0059.articleprocessor.model.Reference; +import com.dre0059.articleprocessor.repository.DocumentRepository; +import com.dre0059.articleprocessor.repository.ReferenceRepository; +import org.springframework.stereotype.Service; +import org.w3c.dom.*; +import org.xml.sax.InputSource; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.List; + +@Service +public class TEIparser { + + + private final ReferenceRepository referenceRepository; + private final DocumentRepository documentRepository; + + public TEIparser(ReferenceRepository referenceRepository, DocumentRepository documentRepository) { + this.referenceRepository = referenceRepository; + this.documentRepository = documentRepository; + } + + public void parseAndSaveToDB(String xmlContent, DocumentMetadata document) { + try { + List<Reference> references = parseReferencesFromXML(xmlContent, document); + + if (!references.isEmpty()) { + referenceRepository.saveAll(references); + System.out.println("References successfully saved to DB"); + } 
else { + System.out.println("No valid references found in XML."); + } + } catch (Exception e) { + System.err.println("Error parsing references: " + e.getMessage()); + e.printStackTrace(); + } + } + + private List<Reference> parseReferencesFromXML(String xmlContent, DocumentMetadata document) { + List<Reference> references = new ArrayList<>(); + + try { + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + dbFactory.setNamespaceAware(true); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + Document doc = dBuilder.parse(new InputSource(new StringReader(xmlContent))); + + // <biblStruct> - bibliographical information (consists of all needed information) + NodeList biblStructs = doc.getElementsByTagNameNS("*", "biblStruct"); + + for (int i = 0; i < biblStructs.getLength(); i++) { + Element bibl = (Element) biblStructs.item(i); + + String title = getTagValueNS("title", bibl, "Unknown Title"); + String publisher = getTagValueNS("publisher", bibl, "Unknown Publisher"); + String year = getTagValueNS("year", bibl, "Unknown Year"); + + List<String> authors = new ArrayList<>(); + NodeList authorNodes = bibl.getElementsByTagNameNS("*", "author"); + + for (int j = 0; j < authorNodes.getLength(); j++) { + Element authorElement = (Element) authorNodes.item(j); + Element persName = (Element) authorElement.getElementsByTagNameNS("*", "persName"); + + if (persName != null) { + String forename = getTagValueNS("forename", persName, ""); + String surname = getTagValueNS("surname", persName, ""); + if (!forename.isEmpty() || !surname.isEmpty()) { + authors.add(forename + " " + surname); + } + } + + } + references.add(new Document(title, year, doi, abstractText, pages, publisher)); + + } + } catch (Exception e) { + System.err.println("Failed to parse references XML: " + e.getMessage()); + e.printStackTrace(); + } + + return references; + } + + private static String getTagValueNS(String tagName, Element element, String defaultValue) { + NodeList 
nodeList = element.getElementsByTagNameNS("*", tagName); + return (nodeList.getLength() > 0) ? nodeList.item(0).getTextContent().trim() : defaultValue; + } +} diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties new file mode 100644 index 0000000..c1d0451 --- /dev/null +++ b/src/main/resources/application.properties @@ -0,0 +1,44 @@ +server.port=8080 +spring.application.name=articleProcessor + +#spring.datasource.url=jdbc:sqlite:pdf_database.db +#spring.datasource.driver-class-name=org.sqlite.JDBC +#spring.jpa.database-platform=org.hibernate.dialect.SQLiteDialect +#spring.jpa.hibernate.ddl-auto=update + +#spring.datasource.url=jdbc:sqlite:"D:\\Bakalarka\\my_db\\first_database.db" +#driverClassName=org.sqlite.JDBC +#url=jdbc:sqlite:memory:myDb?cache=shared +#username=sa +#password=sa +#spring.jpa.database-platform=org.hibernate.community.dialect.SQLiteDialect +#spring.jpa.hibernate.ddl-auto=create-drop +#spring.jpa.show-sql=true + + +# SQLite database configuration +#spring.datasource.url=jdbc:sqlite:pdf_database.db +#spring.jpa.database-platform=org.hibernate.dialect.SQLiteDialect +#spring.jpa.hibernate.ddl-auto=create-drop +#spring.jpa.show-sql=true + +#spring.datasource.url=jdbc:h2:mem:testdb +spring.datasource.url=jdbc:h2:file:/data/demo +spring.datasource.driverClassName=org.h2.Driver +spring.datasource.username=sa +spring.datasource.password=password +spring.jpa.database-platform=org.hibernate.dialect.H2Dialect +spring.jpa.defer-datasource-initialization=true + +spring.h2.console.enabled=true + +spring.jpa.hibernate.ddl-auto=update +spring.jpa.show-sql=true +spring.jpa.properties.hibernate.format_sql=true + +spring.thymeleaf.prefix=classpath:/templates/ +spring.thymeleaf.suffix=.html + +# Spring MVC for uploading PDF files +spring.servlet.multipart.max-file-size=5MB +spring.servlet.multipart.max-request-size=5MB diff --git a/src/main/resources/templates/upload-success.html 
<!DOCTYPE html>
<html xmlns:th="http://www.thymeleaf.org">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Upload Success</title>
</head>
<body>
<h1>Upload Successful</h1>
<p th:text="${message}"></p>
<p>Converted text file: <a th:href="@{${txtFilePath}}" th:text="${txtFilePath}"></a></p>
<!-- The upload form is served by FileUploadController under /api/grobid/upload -->
<a th:href="@{/api/grobid/upload}" href="/api/grobid/upload">Go back to upload another file</a>

<p>Are the following references correct?</p>
<button id="yesButton">Yes</button>
<button id="noButton">No</button>

<div id="references" style="display: block;">
    <!-- Text of the extracted references. Rendered escaped (th:text): it originates from an
         uploaded document and must not be interpreted as markup. -->
    <div th:text="${referencesText}" style="white-space: pre-wrap;"></div>
</div>

<div id="fullText" style="display: none;">
    <h3>Full text:</h3>
    <pre th:text="${fullText}" id="fullTextContent" style="border: 1px solid #ccc; padding: 10px; cursor: text;"></pre>
    <button id="saveSelectedText">Save Selected References</button>
</div>

<button id="showReferences">Show saved references</button>
<!-- white-space: pre-line keeps the line breaks of the plain-text response visible -->
<div id="savedReferences" style="white-space: pre-line;"></div>

<script>
    // Show the full text when "No" is clicked
    document.getElementById("noButton").addEventListener("click", function () {
        document.getElementById("fullText").style.display = "block";
        document.getElementById("references").style.display = "none";
    });

    // Save the selected text
    document.getElementById("saveSelectedText").addEventListener("click", function () {
        // Read the current selection
        const selection = window.getSelection().toString().trim();

        if (!selection) {
            alert("Please select some text to save.");
            return;
        }

        // Send the selected text to the server
        fetch('/saveReferences', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({ selectedText: selection }),
        })
            .then(response => {
                if (response.ok) {
                    alert("Selected references saved successfully.");
                } else {
                    return response.text().then(err => { throw new Error(err); });
                }
            })
            .catch(error => console.error("Error saving references:", error));
    });

    document.getElementById("showReferences").addEventListener("click", function () {
        fetch('/getReferences')
            .then(response => {
                if (!response.ok) {
                    return response.text().then(err => {
                        throw new Error(err);
                    });
                }
                return response.text(); // references as one string
            })
            .then(data => {
                const savedReferencesDiv = document.getElementById("savedReferences");
                console.log(data);
                // textContent instead of innerHTML: the response is plain text and must not be
                // parsed as HTML; line breaks are rendered through the pre-line style above.
                savedReferencesDiv.textContent = data;
            })
            .catch(error => console.error("Error fetching references: ", error));
    });
</script>
</body>
</html>
<!-- JSON vĂ˝stup --> + <div id = "response-containter" style="display: none;"> + <h3>Response : </h3> + <pre id = "json-output"></pre> + </div> + + <!-- .js na zobrazenie PDF --> + <script> + document.getElementById('fileInput').addEventListener('change', function(event){ + const file = event.target.files[0]; + if(file && file.type === 'application/pdf'){ + const fileURL = URL.createObjectURL(file); + document.getElementById('pdf-preview').src = fileURL; + document.getElementById('pdf-preview').style.display = 'block'; + } else { + alert ('Please, choose valid PDF file.'); + } + }); + + // AJAX for sending a file + $('#uploadForm').submit(function(event) { + event.preventDefault(); + const fileInput = $('#fileInput')[0].files[0]; + + if(!fileInput){ + alert("Please select a PDF file first."); + return; + } + + const formData = new FormData(); + formData.append("file", fileInput); + + $.ajax({ + url: "/api/grobid/upload", + type : "POST", + data : formData, + processData : false, + contentType : false, + success : function(response){ + $('#json-output').text(JSON.stringify(response, null, 4)); + $('#response-containter').show(); + }, + error: function(){ + alert("Error processing PDF."); + } + }); + }); + </script> + </body> +</html> diff --git a/src/test/java/com/dre0059/articleprocessor/ArticleProcessorApplicationTests.java b/src/test/java/com/dre0059/articleprocessor/ArticleProcessorApplicationTests.java new file mode 100644 index 0000000..9bb5c39 --- /dev/null +++ b/src/test/java/com/dre0059/articleprocessor/ArticleProcessorApplicationTests.java @@ -0,0 +1,13 @@ +package com.dre0059.articleprocessor; + +import org.junit.jupiter.api.Test; +import org.springframework.boot.test.context.SpringBootTest; + +@SpringBootTest +class ArticleProcessorApplicationTests { + + @Test + void contextLoads() { + } + +} -- GitLab