From dc1846d774cf694430a3b987fa8bacd2f688e6d0 Mon Sep 17 00:00:00 2001 From: dre0059 <eliska.dreveniakova@vsb.cz> Date: Wed, 30 Apr 2025 19:26:38 +0200 Subject: [PATCH] =?UTF-8?q?Dokument=C3=A1cia=20update?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 10 ++-- .../articleprocessor/GrobidClient.java | 53 ++++++++++++------- .../service/ReferenceService.java | 40 ++++++++++++-- .../service/TEINamespaceContext.java | 16 ++++++ .../resources/templates/about-project.html | 10 ++++ .../templates/citation-timeline.html | 11 +++- .../resources/templates/more-citations.html | 10 ++++ src/main/resources/templates/statistics.html | 9 ++++ src/main/resources/templates/upload.html | 11 ++++ src/main/resources/templates/view-all.html | 9 ++++ src/main/resources/templates/view-pdf.html | 10 ++++ 11 files changed, 163 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 3e72bff..9a4162f 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,15 @@ -# Articleprocessor GROBID version 1.0 +# Articleprocessor GROBID version - The main program runs at : *http://localhost:8080/upload* or *http://localhost:8080* - H2 database can be found on : *http://localhost:8080/h2-console/login.jsp?jsessionid=9af0ea4b83284ff0a4574769b0336943* - Password for the DBS can be found in : `resources/application.properties` -- GROBID server available on address *http://158.196.98.65:8080/* at university network +- GROBID server available on address *http://158.196.98.65:8080/* in university network + + +_VŠB-TUO 2025_ + +_Eliška Kozáčiková - DRE0059_ ---------- --------- diff --git a/src/main/java/com/dre0059/articleprocessor/GrobidClient.java b/src/main/java/com/dre0059/articleprocessor/GrobidClient.java index ccb56ec..93da170 100644 --- a/src/main/java/com/dre0059/articleprocessor/GrobidClient.java +++ b/src/main/java/com/dre0059/articleprocessor/GrobidClient.java @@ -20,19 +20,39 @@ import reactor.core.publisher.Mono; import 
java.io.File; import java.net.ConnectException; - +/** + * Tento klient používa WebClient na odosielanie požiadaviek + * na server GROBID a získavanie spracovaných metadát, + * referencií a hlavičiek z PDF dokumentov. + * * + * Implementuje metódy na komunikáciu s API Grobid servera pre rôzne + * typy analýz PDF dokumentov. + */ @Service public class GrobidClient { private final WebClient webClient; - public GrobidClient(GrobidProperties grobidProperties) { + /** + * Konštrukto, ktorý inicializuje WebClient pre komunikáciu s GROBID serverom + * + * @param grobidProperties Konfigurácia obsahujúca URL hostiteľa GROBID servera. + */ + public GrobidClient(GrobidProperties grobidProperties) { - this.webClient = WebClient.builder() - .baseUrl(grobidProperties.getHost()) // URL kde beží GROBID server - .build(); - } + this.webClient = WebClient.builder() + .baseUrl(grobidProperties.getHost()) // URL kde beží GROBID server + .build(); + } - // get METADATA of the file + /** + * Posiela požiadavku na server GROBID na spracovanie hlavičky PDF dokumentu. + * Metóda získa základné informácie o dokumente + * (názov dokumentu, autorov a ďalšie metadáta z hlavičky) + * + * @param pdfFile PDF súbor, ktorý sa má spracovať. + * @return JSON reťazec obsahujúci extrahované metadáta o dokumente. + * @throws RuntimeException ak nastane problém pri komunikácii so serverom GROBID alebo pri spracovaní odpovede. 
+ */ public String processHeader(File pdfFile){ // Mono - vráti jeden string, výsledok je JSON try { @@ -59,18 +79,15 @@ public class GrobidClient { } } - public String processFullMetadata(File pdfFile) { - return webClient.post() - .uri("/api/processFullMetadata") - .contentType(MediaType.MULTIPART_FORM_DATA) - .body(BodyInserters.fromMultipartData("input", new FileSystemResource(pdfFile))) - .retrieve() - .bodyToMono(String.class) - .block(); - } - - // spracuje REFERENCIE z PDF + /** + * Posiela požiadavku na server GROBID + * na SPRACOVANIE REFERENCIÍ v PDF dokumente. + * + * @param pdfFile PDF súbor, ktorý sa má spracovať. + * @return JSON reťazec obsahujúci extrahované bibliografické referencie. + * @throws RuntimeException ak nastane problém pri komunikácii so serverom GROBID alebo pri spracovaní odpovede. + */ public String processReferences(File pdfFile){ try { return webClient.post() diff --git a/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java b/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java index 875c65c..1ac5685 100644 --- a/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java +++ b/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java @@ -13,6 +13,7 @@ import com.dre0059.articleprocessor.model.Reference; import com.dre0059.articleprocessor.repository.AuthorRepository; import com.dre0059.articleprocessor.repository.DocumentRepository; import com.dre0059.articleprocessor.repository.ReferenceRepository; +import lombok.Setter; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; import org.w3c.dom.Document; @@ -31,7 +32,17 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +/** + * + * Trieda na extrahovanie a ukladanie referencií medzi dokumentmi. 
+ * * + * Táto trieda : + * - spracováva XML dáta s referenciami (získané z GROBID servera) + * - vyhľadáva v databáze existujúce dokumenty a autorov + * - vytvára nové referencie medzi dokumentami + */ @Service +@Setter public class ReferenceService { private final DocumentRepository documentRepository; @@ -48,15 +59,31 @@ public class ReferenceService { this.referenceRepository = referenceRepository; } + /** + * Nastaví dokument, z ktorého sa cituje + (dokument v ktorom sa zoznam referencií nachádzal = PDF dokument) + * + * @param fromDocument Dokument, z ktorého sa referencuje + */ public void setFromDocument(Dokument fromDocument) { this.fromDocument = fromDocument; System.out.println("From document: " + fromDocument.getTitle()); } + /** + * Nastaví dokument, na ktorý je citovaný + * + * @param doc Dokument, na ktorý sa referencuje + */ public void setToDocument(Dokument doc){ this.toDocument = doc; } + /** + * Extrahuje referencie z XML TEI dokumentu a uloží ich do databázy. 
+ * + * @param xmlTeiReferences XML reťazec obsahujúci referencie + */ public void extractReferences(String xmlTeiReferences) { List<Author> databaseAuthors = this.authorRepository.findAll(); Map<String, Author> authorMap = new HashMap<>(); @@ -67,6 +94,7 @@ public class ReferenceService { } try { + // Vytvorenie DocumentBuilder pre načítanie XML dokumentu DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setNamespaceAware(true); DocumentBuilder builder = factory.newDocumentBuilder(); @@ -74,10 +102,12 @@ public class ReferenceService { InputSource inputSource = new InputSource(new StringReader(xmlTeiReferences)); Document doc = builder.parse(inputSource); + // Nastavenie XPath pre vyhľadávanie v XML XPathFactory xpathFactory = XPathFactory.newInstance(); XPath xpath = xpathFactory.newXPath(); xpath.setNamespaceContext(new TEINamespaceContext()); + // Výber všetkých biblioštruktúr (referencií) v XML dokumente NodeList biblNodes = (NodeList) xpath.evaluate("//tei:biblStruct", doc, XPathConstants.NODESET); // for each reference @@ -121,8 +151,10 @@ public class ReferenceService { String firstName = xpath.evaluate(".//tei:forename", authorNode); String lastName = xpath.evaluate(".//tei:surname", authorNode); + // Vytvorenie kľúča pre autora String authorKey = lastName.toLowerCase() + "," + firstName.toLowerCase(); + // Ak autor existuje v databáze, pridá sa do zoznamu, inak sa vytvorí nový if (authorMap.containsKey(authorKey)) { authors.add(authorMap.get(authorKey)); System.out.println("Author: " + authorMap.get(authorKey) + " already exists in database."); @@ -135,15 +167,14 @@ public class ReferenceService { referencedDocument.setAuthors(authors); List<String> authorLastNames= authors.stream().map(Author::getLastname).toList(); - // check if document exists in dbs - boolean exists = documentRepository.existsByTitleAndAuthorsIn(title, authorLastNames); // check whether the document is already saved in DBS + boolean exists = 
documentRepository.existsByTitleAndAuthorsIn(title, authorLastNames); + if(exists){ System.out.println("Document with this title and authors already exist"); // vyhľadaj dokument podľa TITLE alebo AUTORA a nastav ho ako toDokument - referencedDocument = documentRepository.findByTitleAndAuthorsIn(title, authorLastNames) .orElseThrow(() -> new IllegalStateException("Document should exist but was NOT FOUND.")); @@ -157,7 +188,8 @@ public class ReferenceService { this.authorRepository.saveAll(authors); } - Reference reference = new Reference(/*referenceID,*/ fromDocument, toDocument); + // Vytvorenie a uloženie referencie + Reference reference = new Reference(fromDocument, toDocument); // extract ID from the document String referenceID = xpath.evaluate("@*[local-name()='id']", biblNode); diff --git a/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java b/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java index 001f791..784132f 100644 --- a/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java +++ b/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java @@ -10,7 +10,19 @@ package com.dre0059.articleprocessor.service; import javax.xml.namespace.NamespaceContext; import java.util.Iterator; +/** + * Implementácia rozhrania NamespaceContext pre XPath dotazy na TEI XML dokumenty. + * * + * Táto trieda poskytuje správne názvové priestory pre XPath dotazy. + */ public class TEINamespaceContext implements NamespaceContext { + + /** + * Kontroluje, či je prefix "tei" + * + * @param prefix prefix to check + * @return URL, ktoré je názvovým priestorom pre TEI (resp. 
null) + */ @Override public String getNamespaceURI(String prefix) { if ("tei".equals(prefix)) { @@ -19,6 +31,10 @@ public class TEINamespaceContext implements NamespaceContext { return null; } + /** + * povinné metódy z rozhrania + * nevyužívame tieto dve metódy + */ @Override public String getPrefix(String namespaceURI) { return null; } @Override diff --git a/src/main/resources/templates/about-project.html b/src/main/resources/templates/about-project.html index f335acf..02fbd4d 100644 --- a/src/main/resources/templates/about-project.html +++ b/src/main/resources/templates/about-project.html @@ -1,3 +1,13 @@ +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument obsahuje informácie o projekte + +--> + <!DOCTYPE html> <html lang="en" xmlns:th="http://www.thymeleaf.org"> <head> diff --git a/src/main/resources/templates/citation-timeline.html b/src/main/resources/templates/citation-timeline.html index 6cdff9b..13783a5 100644 --- a/src/main/resources/templates/citation-timeline.html +++ b/src/main/resources/templates/citation-timeline.html @@ -1,4 +1,13 @@ - <!DOCTYPE html> +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument zobrazuje graf dokumentu a jeho refrencií. 
+ --> +<!DOCTYPE html> <html xmlns:th="http://www.thymeleaf.org"> <head> <!-- zobrazuje graf článku na /statistics/citation-timeline?documentId={id} --> diff --git a/src/main/resources/templates/more-citations.html b/src/main/resources/templates/more-citations.html index b59b664..635abea 100644 --- a/src/main/resources/templates/more-citations.html +++ b/src/main/resources/templates/more-citations.html @@ -1,3 +1,13 @@ +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument zobrazuje graf viacerých dokumentov a ich cítácií + +--> + <!DOCTYPE html> <html xmlns:th="http://www.thymeleaf.org"> <head> diff --git a/src/main/resources/templates/statistics.html b/src/main/resources/templates/statistics.html index 54868a1..b52459d 100644 --- a/src/main/resources/templates/statistics.html +++ b/src/main/resources/templates/statistics.html @@ -1,3 +1,12 @@ +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument obsahuje všetobecné štatistiky o dokumentoch v celej aplikácii + +--> <!DOCTYPE html> <html xmlns:th="http://www.thymeleaf.org"> <head> diff --git a/src/main/resources/templates/upload.html b/src/main/resources/templates/upload.html index 82e70a2..97538c3 100644 --- a/src/main/resources/templates/upload.html +++ b/src/main/resources/templates/upload.html @@ -1,3 +1,14 @@ +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument zobrazuje úvodnú stránku, + umožňuje užívateľovi nahrať PDF dokument, + priradiť mu kategóriu a tagy a odoslať ho na spracovanie + +--> <!DOCTYPE html> <html lang="en" xmlns:th="http://www.thymeleaf.org"> <head> diff --git a/src/main/resources/templates/view-all.html b/src/main/resources/templates/view-all.html index 5150e3b..433dca4 100644 --- 
a/src/main/resources/templates/view-all.html +++ b/src/main/resources/templates/view-all.html @@ -1,3 +1,12 @@ +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument zobrazuje zoznam všetkých existujúcich článkov v databáze + +--> <!DOCTYPE html> <html lang="en" xmlns:th="http://www.thymeleaf.org"> <head> diff --git a/src/main/resources/templates/view-pdf.html b/src/main/resources/templates/view-pdf.html index e13be45..8cd5e87 100644 --- a/src/main/resources/templates/view-pdf.html +++ b/src/main/resources/templates/view-pdf.html @@ -1,3 +1,13 @@ +<!-- + Autor: Eliška Kozáčiková + Škola: VŠB-TUO + Fakulta: Fakulta Elektrotechniky a informatiky + Dátum: 30.04.2025 + + Tento HTML dokument zobrazuje jeden článok, informácie o ňom a zoznam jeho referencií + (referencie sú klikateľné odkazy na iné články v DBS) + +--> <!DOCTYPE html> <html xmlns:th="http://www.thymeleaf.org"> <head> -- GitLab