From bed619e3403ddbba2797c21a1c277c0bd63efac7 Mon Sep 17 00:00:00 2001 From: Mesharo <Hecko97@seznam.cz> Date: Sat, 21 Sep 2024 09:33:27 +0200 Subject: [PATCH] filtering postgresql questions --- README.md | 10 ++++++- main.py | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 89 insertions(+), 1 deletion(-) create mode 100644 main.py diff --git a/README.md b/README.md index f9aed02..3fa9297 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,10 @@ # SQL-schemas -Bachelor's thesis - Database schema extraction from a collection of questions on StackOverflow using SQL analysis. + +Extrakce schématu databáze z kolekce otázek na StackOverflow s využitím analýzy SQL + +StackOverflow je jedna z největších question and answer sítí. Je zde statisíce otázek s tématem SQL a velké množství SQL dotazů. Pro další analýzu SQL dotazů by byla vhodná znalost schématu databáze, kterých se dotazy týkají. Schéma bývá v textu uvedeno v různých formách. Cílem této práce je zautomatizovat čtení schémat pro co největší množství SQL. + +- Seznámení se s možnostmi parsování SQL a extrakce schématu z SQL dotazu. +- Implementace knihovny, která pro vstupní SQL dotaz vrátí seznam tabulek spolu se seznamem atributů a případně i s návrhy datových typů jednotlivých atributů. +- Příprava trénovací kolekce pro umělou inteligenci, kde pro StackOverflow dotaz bude uveden očekávaný DDL skript. Kolekce bude mít alespoň sto prvků. +- Implementace ověřovací aplikace, která pro DDL skript vykoná dotaz. 
# main.py -- content of the patch hunk "diff --git a/main.py b/main.py"
# (new file mode 100644, index 0000000..5933ae8), restored to one statement
# per line.
"""Filter PostgreSQL questions out of a StackOverflow data dump.

The script first collects the tag names of interest from ``Tags.xml``
(``postgresql`` itself plus the ``postgresql-<version>`` tags), then streams
``Posts.xml`` and appends every post carrying one of those tags to a text
file so that later runs do not have to re-read the whole dump.
"""

import re
import time

from bigxml import Parser, xml_handle_element

# One <row .../> element of Tags.xml; only Id and TagName are captured.
_TAG_ROW_RE = re.compile(r'<row Id="(.+?)" TagName="(.+?)"')

# "postgresql-" followed by a purely numeric version, e.g. "postgresql-9.3".
_VERSION_TAG_RE = re.compile(r'postgresql-\d+(?:\.\d+)*')

# Characters separating individual tags inside a post's Tags attribute.
# NOTE(review): assumes tags are delimited as "<a><b>" or "|a|b|" -- confirm
# against the dump that is actually processed.
_TAG_DELIMITER_RE = re.compile(r'[<>|]+')


def get_postgresql_tags(file_name: str) -> list:
    """Return the PostgreSQL-related tag names found in a Tags.xml file.

    A tag is kept when it is exactly ``postgresql`` or when it has the form
    ``postgresql-<numeric version>``. Lines that do not look like a
    ``<row Id=... TagName=...>`` element are skipped.

    Args:
        file_name: Path to the UTF-8 encoded Tags.xml file.

    Returns:
        The matching tag names, in file order.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    result = []

    # ``with`` guarantees the handle is closed even if reading fails.
    with open(file_name, 'r', encoding='utf8') as tags_file:
        for row in tags_file:
            id_tagname = _TAG_ROW_RE.search(row)

            if not id_tagname:
                # Either meta data or <tags> element.
                continue

            tagname = id_tagname.group(2)

            # fullmatch: the whole tag must be a version tag, so names such
            # as "postgresql-performance" are rejected.
            if tagname == 'postgresql' or _VERSION_TAG_RE.fullmatch(tagname):
                result.append(tagname)

    return result


def _optional_attribute(node, name: str) -> str:
    """Return attribute ``name`` of ``node``, or ``'N/A'`` when it is absent."""
    try:
        return node.attributes[name]
    except KeyError:
        return 'N/A'


@xml_handle_element('posts', 'row')
def handler(node):
    """Yield one dict describing a ``<row>`` element below ``<posts>``.

    ``Id`` and ``Body`` are required (a missing one raises ``KeyError``);
    ``Tags`` and ``AcceptedAnswerId`` fall back to ``'N/A'``.
    """
    yield {
        'id': node.attributes['Id'],
        'body': node.attributes['Body'],
        'tags': _optional_attribute(node, 'Tags'),
        'accepted_answer_id': _optional_attribute(node, 'AcceptedAnswerId'),
    }


def filter_postgresql_questions(
    file_name: str,
    tags_file_name: str = r'D:\stackoverflow.com\Tags.xml',
    output_file_name: str = r'D:\postgresql_questions.txt',
) -> None:
    """Append all PostgreSQL-tagged posts of a Posts.xml dump to a text file.

    Each matching post is written once, as the ``str()`` of the dict produced
    by ``handler``, followed by a newline. A running counter is printed per
    written post and the elapsed time in minutes is printed at the end.

    Args:
        file_name: Path to the Posts.xml file to scan.
        tags_file_name: Path to the Tags.xml file the wanted tags are read
            from.
        output_file_name: Path of the file the matches are appended to
            (opened in append mode, so earlier content is kept).

    Raises:
        OSError: If one of the files cannot be opened.
    """
    # A set gives O(1) membership tests for every post in the dump.
    postgresql_tags = set(get_postgresql_tags(tags_file_name))
    count = 0

    start = time.time()
    with open(output_file_name, 'a', encoding='utf8') as output_file:
        with open(file_name, 'rb') as xml_file:
            for item in Parser(xml_file).iter_from(handler):
                # Compare whole tag names: a substring test would also accept
                # tags like "postgresql-performance" and could write the same
                # post once per matching tag.
                post_tags = _TAG_DELIMITER_RE.split(item['tags'])
                if postgresql_tags.isdisjoint(post_tags):
                    continue

                output_file.write(str(item) + '\n')
                print(count)
                count += 1
    end = time.time()
    print(f'Time elapsed: {(end - start) / 60}')


if __name__ == "__main__":
    # Raw string: "\s" and "\P" are not valid escape sequences.
    filter_postgresql_questions(r'D:\stackoverflow.com\Posts.xml')