Skip to content
Snippets Groups Projects
Commit 6b367d34 authored by dre0059's avatar dre0059
Browse files

Save documents and authors in reference list of the PDF file

parent 86f11116
No related merge requests found
......@@ -3,6 +3,7 @@ package com.dre0059.articleprocessor.controller;
import com.dre0059.articleprocessor.GrobidClient;
import com.dre0059.articleprocessor.service.HeaderService;
import com.dre0059.articleprocessor.service.ReferenceService;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
......@@ -18,18 +19,12 @@ import java.io.IOException;
public class FileUploadController {
private final GrobidClient grobidClient;
private final HeaderService headerService;
private final ReferenceService referenceService;
//private final DocumentRepository metadataRepository;
//private final ReferenceRepository referenceRepository;
//private final MetadataParser metadataParser;
//private static final Logger logger = LoggerFactory.getLogger(FileUploadController.class);
/**
 * Creates the upload controller with the collaborators it delegates to.
 *
 * @param grobidClient     GROBID client used to process the uploaded file
 * @param headerService    service that receives the extracted header for processing
 * @param referenceService service that receives the extracted reference list for processing
 */
public FileUploadController(GrobidClient grobidClient, HeaderService headerService, ReferenceService referenceService) {
    this.grobidClient = grobidClient;
    this.headerService = headerService;
    this.referenceService = referenceService;
}
@GetMapping("/upload")
......@@ -61,9 +56,10 @@ public class FileUploadController {
String references = grobidClient.processReferences(tmpFile);
headerService.processHeader(header);
referenceService.extractReferences(references);
System.out.println(header);
//System.out.println(references);
System.out.println(references);
tmpFile.delete();
......
......@@ -22,6 +22,7 @@ public class HeaderService {
private final DocumentRepository documentRepository;
private final AuthorRepository authorRepository;
private final ReferenceService referenceService;
//public Dokument(String title, Integer year, String doi, String abstractText, Integer pages, String publisher) {
......@@ -36,9 +37,10 @@ public class HeaderService {
private String author;
/**
 * Creates the header service with its repositories and the reference service.
 *
 * @param documentRepository repository used to save the document built from the header
 * @param authorRepository   repository for the document's authors
 * @param referenceService   service that is told which saved document owns the reference list
 */
@Autowired
public HeaderService(DocumentRepository documentRepository, AuthorRepository authorRepository, ReferenceService referenceService) {
    this.documentRepository = documentRepository;
    this.authorRepository = authorRepository;
    this.referenceService = referenceService;
}
public void processHeader(String header){
......@@ -85,6 +87,10 @@ public class HeaderService {
dokument.setAuthors(savedAuthors);
this.documentRepository.save(dokument);
// set the document, which has the list of references
referenceService.setFromDocument(dokument);
}
private String parseHeaderFields(String header, String field){
......
......@@ -5,14 +5,140 @@ import com.dre0059.articleprocessor.repository.*;
import jakarta.transaction.Transactional;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// TODO :
// 1. uložiť prepojenie toDocument a fromDocument do tabuľky referencie
// 2. vytiahnuť orderNumber z referencie (toto riešiť cez GROBID)
// 3. aktuálne sa mi toDocument ukladá vždy ako nový.. ja ho potrebujem vyhľadať a na základe toho uložiť alebo prepojiť
@Service
public class ReferenceService {
private final DocumentRepository documentRepository;
private final AuthorRepository authorRepository;
//private String title;
private Integer year;
private String doi;
private String abstractText;
private Integer pages;
private String publisher;
private String author;
private List<Author> authorList = new ArrayList<>();
private Dokument fromDocument;
private Dokument toDocument;
@Autowired
private ReferenceRepository referenceRepository;
public ReferenceService(DocumentRepository documentRepository, AuthorRepository authorRepository) {
this.documentRepository = documentRepository;
this.authorRepository = authorRepository;
}
public void setFromDocument(Dokument fromDocument) {
this.fromDocument = fromDocument;
}
public void extractReferences(String xmlTeiReferences) {
List<Author> databaseAuthors = this.authorRepository.findAll();
Map<String, Author> authorMap = new HashMap<>();
for (Author author : databaseAuthors) {
String key = author.getLastname().toLowerCase() + "," + author.getFirstname().toLowerCase();
authorMap.put(key, author);
}
try {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
factory.setNamespaceAware(true);
DocumentBuilder builder = factory.newDocumentBuilder();
InputSource inputSource = new InputSource(new StringReader(xmlTeiReferences));
Document doc = builder.parse(inputSource);
XPathFactory xpathFactory = XPathFactory.newInstance();
XPath xpath = xpathFactory.newXPath();
xpath.setNamespaceContext(new TEINamespaceContext());
@Transactional
public Reference saveReference(Reference reference) {
return referenceRepository.save(reference);
NodeList biblNodes = (NodeList) xpath.evaluate("//tei:biblStruct", doc, XPathConstants.NODESET);
for (int i = 0; i < biblNodes.getLength(); i++) {
Node biblNode = biblNodes.item(i);
Dokument toDokument = new Dokument();
// Extract title - toDocument
String title = xpath.evaluate(".//tei:title[@level='m' or @level='a']", biblNode);
toDokument.setTitle(title);
// Extract authors
NodeList authorNodes = (NodeList) xpath.evaluate(".//tei:author/tei:persName", biblNode, XPathConstants.NODESET);
List<Author> authors = new ArrayList<>();
for (int j = 0; j < authorNodes.getLength(); j++) {
Node authorNode = authorNodes.item(j);
String firstName = xpath.evaluate(".//tei:forename", authorNode);
String lastName = xpath.evaluate(".//tei:surname", authorNode);
String authorKey = lastName.toLowerCase() + "," + firstName.toLowerCase();
if (authorMap.containsKey(authorKey)) {
authors.add(authorMap.get(authorKey));
System.out.println("Author: " + authorMap.get(authorKey) + " already exists in database.");
} else {
Author newAuthor = new Author(firstName, lastName);
authors.add(newAuthor);
authorMap.put(authorKey, newAuthor);
}
toDokument.setAuthors(authors);
// Extract year of publication
String yearStr = xpath.evaluate(".//tei:date[@type='published']/@when", biblNode);
if (yearStr != null && !yearStr.isEmpty()) {
try {
toDokument.setPublicationYear(Integer.valueOf(yearStr));
} catch (NumberFormatException e) {
System.out.println("Error during converting year." + yearStr);
}
}
// Extract publisher
String publisher = xpath.evaluate(".//tei:publisher", biblNode);
toDokument.setPublisher(publisher);
this.documentRepository.save(toDokument);
this.authorRepository.saveAll(authors);
}
}
} catch (Exception e) {
e.printStackTrace();
}
}
}
}
package com.dre0059.articleprocessor.service;
import java.util.Collections;
import java.util.Iterator;
import javax.xml.XMLConstants;
import javax.xml.namespace.NamespaceContext;
/**
 * {@link NamespaceContext} that binds the {@code tei} prefix to the TEI namespace so XPath
 * expressions can address elements of a namespace-aware TEI document.
 *
 * <p>The implementation follows the {@code NamespaceContext} contract: {@code null} arguments are
 * rejected, the reserved {@code xml} and {@code xmlns} bindings are honoured, unbound prefixes
 * map to {@link XMLConstants#NULL_NS_URI}, and reverse lookups work for every bound namespace.
 */
public class TEINamespaceContext implements NamespaceContext {

    private static final String TEI_PREFIX = "tei";
    private static final String TEI_NS_URI = "http://www.tei-c.org/ns/1.0";

    /**
     * Returns the namespace URI bound to {@code prefix}.
     *
     * @param prefix the prefix to resolve
     * @return the bound URI, or {@link XMLConstants#NULL_NS_URI} when the prefix is unbound
     * @throws IllegalArgumentException if {@code prefix} is {@code null}
     */
    @Override
    public String getNamespaceURI(String prefix) {
        if (prefix == null) {
            throw new IllegalArgumentException("prefix must not be null");
        }
        if (TEI_PREFIX.equals(prefix)) {
            return TEI_NS_URI;
        }
        if (XMLConstants.XML_NS_PREFIX.equals(prefix)) {
            return XMLConstants.XML_NS_URI;
        }
        if (XMLConstants.XMLNS_ATTRIBUTE.equals(prefix)) {
            return XMLConstants.XMLNS_ATTRIBUTE_NS_URI;
        }
        return XMLConstants.NULL_NS_URI;
    }

    /**
     * Returns the prefix bound to {@code namespaceURI}.
     *
     * @param namespaceURI the namespace URI to look up
     * @return the bound prefix, or {@code null} when the URI is unbound
     * @throws IllegalArgumentException if {@code namespaceURI} is {@code null}
     */
    @Override
    public String getPrefix(String namespaceURI) {
        if (namespaceURI == null) {
            throw new IllegalArgumentException("namespaceURI must not be null");
        }
        if (TEI_NS_URI.equals(namespaceURI)) {
            return TEI_PREFIX;
        }
        if (XMLConstants.XML_NS_URI.equals(namespaceURI)) {
            return XMLConstants.XML_NS_PREFIX;
        }
        if (XMLConstants.XMLNS_ATTRIBUTE_NS_URI.equals(namespaceURI)) {
            return XMLConstants.XMLNS_ATTRIBUTE;
        }
        return null;
    }

    /**
     * Returns all prefixes bound to {@code namespaceURI}; each namespace has at most one here.
     *
     * @param namespaceURI the namespace URI to look up
     * @return an unmodifiable iterator over the bound prefix, empty when the URI is unbound
     * @throws IllegalArgumentException if {@code namespaceURI} is {@code null}
     */
    @Override
    public Iterator<String> getPrefixes(String namespaceURI) {
        String prefix = getPrefix(namespaceURI);
        if (prefix == null) {
            return Collections.emptyIterator();
        }
        return Collections.singletonList(prefix).iterator();
    }
}
package com.dre0059.articleprocessor.service;
/*package com.dre0059.articleprocessor.service;
import com.dre0059.articleprocessor.model.Author;
import com.dre0059.articleprocessor.model.Dokument;
......@@ -60,4 +60,6 @@ public class TEIparser {
}
}
}
\ No newline at end of file
}
*/
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment