diff --git a/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java b/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java index 78fa9a561db0c001a9a3855e0ca7219ca04ceea1..af0c62efdcfc42665ce83b50de5c9898f95baa0d 100644 --- a/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java +++ b/src/main/java/com/dre0059/articleprocessor/controller/FileUploadController.java @@ -3,6 +3,7 @@ package com.dre0059.articleprocessor.controller; import com.dre0059.articleprocessor.GrobidClient; import com.dre0059.articleprocessor.service.HeaderService; +import com.dre0059.articleprocessor.service.ReferenceService; import org.springframework.http.ResponseEntity; import org.springframework.stereotype.Controller; import org.springframework.ui.Model; @@ -18,18 +19,12 @@ import java.io.IOException; public class FileUploadController { private final GrobidClient grobidClient; private final HeaderService headerService; + private final ReferenceService referenceService; - //private final DocumentRepository metadataRepository; - //private final ReferenceRepository referenceRepository; - //private final MetadataParser metadataParser; - //private static final Logger logger = LoggerFactory.getLogger(FileUploadController.class); - - public FileUploadController(GrobidClient grobidClient, HeaderService headerService/*, DocumentRepository metadataRepository, ReferenceRepository referenceRepository, MetadataParser metadataParser*/) { + public FileUploadController(GrobidClient grobidClient, HeaderService headerService, ReferenceService referenceService) { this.grobidClient = grobidClient; this.headerService = headerService; - // this.metadataRepository = metadataRepository; - //this.referenceRepository = referenceRepository; - //this.metadataParser = metadataParser; + this.referenceService = referenceService; } @GetMapping("/upload") @@ -61,9 +56,10 @@ public class FileUploadController { String references = grobidClient.processReferences(tmpFile); headerService.processHeader(header); + referenceService.extractReferences(references); System.out.println(header); - //System.out.println(references); + System.out.println(references); tmpFile.delete(); diff --git a/src/main/java/com/dre0059/articleprocessor/service/HeaderService.java b/src/main/java/com/dre0059/articleprocessor/service/HeaderService.java index a4e9ace833784f699fb0b8df98a9da2e910458ad..9b24e5907805fc5f953156f057862371f054c96e 100644 --- a/src/main/java/com/dre0059/articleprocessor/service/HeaderService.java +++ b/src/main/java/com/dre0059/articleprocessor/service/HeaderService.java @@ -22,6 +22,7 @@ public class HeaderService { private final DocumentRepository documentRepository; private final AuthorRepository authorRepository; + private final ReferenceService referenceService; //public Dokument(String title, Integer year, String doi, String abstractText, Integer pages, String publisher) { @@ -36,9 +37,10 @@ public class HeaderService { private String author; @Autowired - public HeaderService(DocumentRepository documentRepository, AuthorRepository authorRepository) { + public HeaderService(DocumentRepository documentRepository, AuthorRepository authorRepository, ReferenceService referenceService) { this.documentRepository = documentRepository; this.authorRepository = authorRepository; + this.referenceService = referenceService; } public void processHeader(String header){ @@ -85,6 +87,10 @@ public class HeaderService { dokument.setAuthors(savedAuthors); this.documentRepository.save(dokument); + + // set the document, which has the list of references + referenceService.setFromDocument(dokument); + } private String parseHeaderFields(String header, String field){ diff --git a/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java b/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java index 836614114263c7adfd35239d4399afc3203c9b81..03a20e4b4f92a287f4c2701b982d490f368021e8 100644 --- a/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java +++ b/src/main/java/com/dre0059/articleprocessor/service/ReferenceService.java @@ -5,14 +5,140 @@ import com.dre0059.articleprocessor.repository.*; import jakarta.transaction.Transactional; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.stereotype.Service; +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.xpath.XPath; +import javax.xml.xpath.XPathConstants; +import javax.xml.xpath.XPathExpressionException; +import javax.xml.xpath.XPathFactory; +import java.io.IOException; +import java.io.InputStream; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +// TODO : +// 1. uložiť prepojenie toDocument a fromDocument do tabuľky referencie +// 2. vytiahnuť orderNumber z referencie (toto riešiť cez GROBID) +// 3. aktuálne sa mi toDocument ukladá vždy ako nový.. ja ho potrebujem vyhľadať a na základe toho uložiť alebo prepojiť @Service public class ReferenceService { + + private final DocumentRepository documentRepository; + private final AuthorRepository authorRepository; + + //private String title; + private Integer year; + private String doi; + private String abstractText; + private Integer pages; + private String publisher; + + private String author; + private List<Author> authorList = new ArrayList<>(); + + private Dokument fromDocument; + private Dokument toDocument; + + @Autowired - private ReferenceRepository referenceRepository; + public ReferenceService(DocumentRepository documentRepository, AuthorRepository authorRepository) { + this.documentRepository = documentRepository; + this.authorRepository = authorRepository; + } + + public void setFromDocument(Dokument fromDocument) { + this.fromDocument = fromDocument; + } + + public void extractReferences(String xmlTeiReferences) { + List<Author> databaseAuthors = this.authorRepository.findAll(); + Map<String, Author> authorMap = new HashMap<>(); + + for (Author author : databaseAuthors) { + String key = author.getLastname().toLowerCase() + "," + author.getFirstname().toLowerCase(); + authorMap.put(key, author); + } + + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + DocumentBuilder builder = factory.newDocumentBuilder(); + + InputSource inputSource = new InputSource(new StringReader(xmlTeiReferences)); + Document doc = builder.parse(inputSource); + + XPathFactory xpathFactory = XPathFactory.newInstance(); + XPath xpath = xpathFactory.newXPath(); + xpath.setNamespaceContext(new TEINamespaceContext()); - @Transactional - public Reference saveReference(Reference reference) { - return referenceRepository.save(reference); + NodeList biblNodes = (NodeList) xpath.evaluate("//tei:biblStruct", doc, XPathConstants.NODESET); + + for (int i = 0; i < biblNodes.getLength(); i++) { + Node biblNode = biblNodes.item(i); + Dokument toDokument = new Dokument(); + + // Extract title - toDocument + String title = xpath.evaluate(".//tei:title[@level='m' or @level='a']", biblNode); + toDokument.setTitle(title); + + // Extract authors + NodeList authorNodes = (NodeList) xpath.evaluate(".//tei:author/tei:persName", biblNode, XPathConstants.NODESET); + List<Author> authors = new ArrayList<>(); + + for (int j = 0; j < authorNodes.getLength(); j++) { + Node authorNode = authorNodes.item(j); + + String firstName = xpath.evaluate(".//tei:forename", authorNode); + String lastName = xpath.evaluate(".//tei:surname", authorNode); + + String authorKey = lastName.toLowerCase() + "," + firstName.toLowerCase(); + + if (authorMap.containsKey(authorKey)) { + authors.add(authorMap.get(authorKey)); + System.out.println("Author: " + authorMap.get(authorKey) + " already exists in database."); + } else { + Author newAuthor = new Author(firstName, lastName); + authors.add(newAuthor); + authorMap.put(authorKey, newAuthor); + } + + toDokument.setAuthors(authors); + + // Extract year of publication + String yearStr = xpath.evaluate(".//tei:date[@type='published']/@when", biblNode); + if (yearStr != null && !yearStr.isEmpty()) { + try { + toDokument.setPublicationYear(Integer.valueOf(yearStr)); + } catch (NumberFormatException e) { + System.out.println("Error during converting year." + yearStr); + } + } + + // Extract publisher + String publisher = xpath.evaluate(".//tei:publisher", biblNode); + toDokument.setPublisher(publisher); + + this.documentRepository.save(toDokument); + this.authorRepository.saveAll(authors); + + } + + } + } catch (Exception e) { + e.printStackTrace(); + } + } } -} + + diff --git a/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java b/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java new file mode 100644 index 0000000000000000000000000000000000000000..4902424f6b2d816643fcb6ed191499336dbcc971 --- /dev/null +++ b/src/main/java/com/dre0059/articleprocessor/service/TEINamespaceContext.java @@ -0,0 +1,19 @@ +package com.dre0059.articleprocessor.service; + +import javax.xml.namespace.NamespaceContext; +import java.util.Iterator; + +public class TEINamespaceContext implements NamespaceContext { + @Override + public String getNamespaceURI(String prefix) { + if ("tei".equals(prefix)) { + return "http://www.tei-c.org/ns/1.0"; + } + return null; + } + + @Override + public String getPrefix(String namespaceURI) { return null; } + @Override + public Iterator<String> getPrefixes(String namespaceURI) { return null; } +} diff --git a/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java b/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java index dbe9378219094cd9786e6aa26c2f0b3ea58c3a06..8e631c7f785bca156ad2ed4317430ea607033129 100644 --- a/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java +++ b/src/main/java/com/dre0059/articleprocessor/service/TEIparser.java @@ -1,4 +1,4 @@ -package com.dre0059.articleprocessor.service; +/*package com.dre0059.articleprocessor.service; import com.dre0059.articleprocessor.model.Author; import com.dre0059.articleprocessor.model.Dokument; @@ -60,4 +60,6 @@ public class TEIparser { } } -} \ No newline at end of file +} + + */ \ No newline at end of file