Skip to content
Snippets Groups Projects
Commit bc07103b authored by dre0059's avatar dre0059
Browse files

Validating DOCUMENT - avoid duplicates

parent 2aac3d37
Branches
No related merge requests found
......@@ -2,36 +2,16 @@ package com.dre0059.articleprocessor.controller;
import com.dre0059.articleprocessor.GrobidClient;
import com.dre0059.articleprocessor.service.HeaderService;
import com.dre0059.articleprocessor.service.MetadataParser;
import com.dre0059.articleprocessor.repository.DocumentRepository;
import com.dre0059.articleprocessor.repository.ReferenceRepository;
import com.dre0059.articleprocessor.service.TEIparser;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.ui.Model;
import org.springframework.web.bind.annotation.*;
import org.springframework.web.multipart.MultipartFile;
import reactor.core.publisher.Mono;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
// TODO :
// 1. ✅ saving an article does not work correctly when the article is already in the DB; currently it only throws an ERROR saying it cannot be processed correctly
// 2. ✅ !!! save the references to the database
// 3. link each reference with the stored articles ???
// 4. make the program more USER-FRIENDLY - a message that the document is being processed, a message that the document is already stored, a message that the document was saved, and print the metadata for verification
// 5. incorrect extraction of references - references that point to a website are not processed
@Controller
@RequestMapping("/api/grobid")
......
......@@ -14,6 +14,28 @@ import java.util.Optional;
/**
 * Spring Data repository for {@code Dokument} entities (primary key type {@code Long}),
 * declaring existence queries by document title and authors.
 *
 * NOTE(review): the two {@code existsByTitleAndAuthorsIn} declarations below share a name and
 * differ only in the element type of their {@code List} parameter, so they have the same
 * erasure and cannot both be declared in one interface. They look like the before/after
 * versions of a single method - confirm which one is meant to remain.
 */
@Repository
public interface DocumentRepository extends JpaRepository<Dokument, Long> {
// Author-entity variant: true when at least one Dokument has the given title and is joined
// to an author that is contained in :authors.
@Query("SELECT COUNT(d) > 0 FROM Dokument d JOIN d.authors a WHERE d.title = :title AND a IN :authors")
boolean existsByTitleAndAuthorsIn(@Param("title") String title, @Param("authors") List<Author> authors);
// Last-name variant: true when at least one Dokument has the given title and is joined to an
// author whose lastName is contained in :lastNames. A single matching author is enough;
// the query does not require the whole author list to match.
@Query(
"SELECT COUNT(d) > 0 " +
"FROM Dokument d " +
"JOIN d.authors a " +
"WHERE d.title = :title " +
"AND a.lastName IN :lastNames"
)
boolean existsByTitleAndAuthorsIn(@Param("title") String title, @Param("lastNames") List<String> lastNames);
}
/*
// save only if all authors are the same
@Query("""
SELECT COUNT(d) > 0
FROM Dokument d
WHERE d.title = :title
AND SIZE(d.authors) = :authorCount
AND EXISTS (
SELECT 1 FROM Dokument d2 JOIN d2.authors a2
WHERE d2.id = d.id AND a2 IN :authors
)
""")
boolean existsByTitleAndAuthors(@Param("title") String title, @Param("authors") List<Author> authors, @Param("authorCount") int authorCount);
*/
\ No newline at end of file
......@@ -40,31 +40,47 @@ public class HeaderService {
public void processHeader(String header){
this.title = this.parseHeaderFields(header, "title");
this.doi = this.parseHeaderFields(header, "doi");
this.abstractText = this.parseHeaderFields(header, "abstract");
this.publisher = this.parseHeaderFields(header, "publisher");
if(this.parseHeaderFields(header, "year").equals("Not found")){
this.year = 0;
if(!this.parseHeaderFields(header, "doi").equals("Not found")){
this.doi = this.parseHeaderFields(header, "doi");
}
if(this.parseHeaderFields(header, "pages").equals("Not found")){
this.pages = 0;
if(!this.parseHeaderFields(header, "abstract").equals("Not found")){
this.abstractText = this.parseHeaderFields(header, "abstract");
}
if(!this.parseHeaderFields(header, "publisher").equals("Not found")){
this.publisher = this.parseHeaderFields(header, "publisher");
}
if(!this.parseHeaderFields(header, "year").equals("Not found")){
String yearString = this.parseHeaderFields(header, "year");
this.year = Integer.parseInt(yearString);
}
if(!this.parseHeaderFields(header, "pages").equals("Not found")){
String pagesString = this.parseHeaderFields(header, "pages");
this.pages = Integer.parseInt(pagesString);
}
this.author = this.parseHeaderFields(header, "author");
if(!this.author.equals("Not found")){
authorList = this.saveAuthorNameAndSurname(this.author);
}
authorRepository.saveAll(authorList);
List<String> authorLastNames= authorList.stream().map(Author::getLastname).toList();
System.out.println("Author list before checking duplicity: " + authorList);
System.out.println("Author last names before checking duplicity: " + authorLastNames);
// tu dostávam error :
boolean headerDuplicity = documentRepository.existsByTitleAndAuthorsIn(title, authorLastNames);
// check duplicity of the document
if(documentRepository.existsByTitleAndAuthorsIn(title, authorList)){
if(headerDuplicity){
System.out.println("Document with this title and authors already exist");
return;
}
authorRepository.saveAll(authorList);
Dokument dokument = new Dokument(title, year, doi, pages, publisher);
dokument.setAuthors(authorList);
......
package com.dre0059.articleprocessor.service;
import com.dre0059.articleprocessor.model.*;
import com.dre0059.articleprocessor.repository.*;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Service;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Parses a BibTeX entry into a {@link Dokument} and stores the parsed authors.
 *
 * <p>Only the title, DOI and authors are transferred to the document. The abstract is matched
 * by the pattern but is currently not stored anywhere.
 */
@Service
public class MetadataParser {

    /**
     * Matches one BibTeX entry whose fields appear in exactly this order:
     * {@code author}, {@code title}, {@code doi}, {@code abstract}.
     * Groups: 1 = authors, 2 = title, 3 = doi, 4 = abstract.
     *
     * <p>No flags are set, so {@code .} does not match line terminators: a field value that
     * spans several lines will not match. Compiled once because the pattern never changes.
     */
    private static final Pattern BIBTEX_ENTRY = Pattern.compile(
            "@.*?\\{.*?,\\s*author\\s*=\\s*\\{(.*?)\\},\\s*title\\s*=\\s*\\{(.*?)\\},\\s*doi\\s*=\\s*\\{(.*?)\\},\\s*abstract\\s*=\\s*\\{(.*?)\\}");

    /** Separator between individual authors inside a BibTeX {@code author} field. */
    private static final String AUTHOR_SEPARATOR = " and ";

    // Not used by the parsing code below (the document itself is not saved here); kept because
    // it is part of the public constructor signature.
    private final DocumentRepository documentRepository;
    private final AuthorRepository authorRepository;

    /**
     * Creates the parser with its repositories (constructor injection).
     *
     * @param documentRepository repository for documents
     * @param authorRepository repository used to store the parsed authors
     */
    @Autowired
    public MetadataParser(DocumentRepository documentRepository, AuthorRepository authorRepository) {
        this.documentRepository = documentRepository;
        this.authorRepository = authorRepository;
    }

    /**
     * Extracts author, title and DOI from a BibTeX entry, saves the authors and returns a
     * document built from those values. Year, pages and publisher are left {@code null}.
     * The returned document is NOT saved by this method.
     *
     * @param bibtexString raw BibTeX text; may be {@code null}
     * @return the parsed document, or {@code null} when the input is {@code null} or does not
     *         match the expected entry layout
     */
    public Dokument parseBibTeX(String bibtexString) {
        if (bibtexString == null) {
            // Treat missing input like non-matching input instead of failing with an NPE.
            return null;
        }

        Matcher matcher = BIBTEX_ENTRY.matcher(bibtexString);
        if (!matcher.find()) {
            return null;
        }

        String authorsRaw = matcher.group(1);
        String title = matcher.group(2);
        String doi = matcher.group(3);

        List<Author> authors = parseAuthors(authorsRaw);

        Dokument document = new Dokument(title, null, doi, null, null);
        document.setAuthors(authors);

        // Same bulk call HeaderService uses for storing authors.
        authorRepository.saveAll(authors);

        return document;
    }

    /**
     * Splits a BibTeX {@code author} value into individual authors.
     *
     * <p>Supported name forms:
     * <ul>
     *   <li>{@code "Last, First"} - text before the first comma is the last name;</li>
     *   <li>{@code "First Rest"} - the first word is the first name, the remainder the last name;</li>
     *   <li>a single word - stored as the last name with an empty first name.</li>
     * </ul>
     * Blank entries (e.g. from an empty author field) are skipped.
     *
     * @param authorsRaw content of the author field, names separated by {@code " and "}
     * @return the parsed authors in input order; never {@code null}
     */
    private List<Author> parseAuthors(String authorsRaw) {
        List<Author> authors = new ArrayList<>();

        for (String rawName : authorsRaw.split(AUTHOR_SEPARATOR)) {
            String fullName = rawName.trim();
            if (fullName.isEmpty()) {
                continue; // do not create an author with an empty name
            }

            int commaIndex = fullName.indexOf(',');
            if (commaIndex >= 0) {
                // "Last, First": splitting on whitespace here would swap the two parts and
                // leave the comma attached to the name.
                String lastName = fullName.substring(0, commaIndex).trim();
                String firstName = fullName.substring(commaIndex + 1).trim();
                authors.add(new Author(lastName, firstName)); // last name, first name
                continue;
            }

            String[] nameParts = fullName.split("\\s+", 2);
            if (nameParts.length == 2) {
                authors.add(new Author(nameParts[1], nameParts[0])); // last name, first name
            } else {
                authors.add(new Author(nameParts[0], "")); // single-word name
            }
        }

        return authors;
    }
}
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment