From 3a86d2a4b14a2d20cbcbb5872c66915381f185ce Mon Sep 17 00:00:00 2001 From: Mesharo <Hecko97@seznam.cz> Date: Thu, 10 Oct 2024 19:08:14 +0200 Subject: [PATCH] cleanup, filepaths, comments --- main.py | 218 +++++++++++++++++++++++++++++++++++++++++-------------- tests.py | 47 +++++++++++- 2 files changed, 208 insertions(+), 57 deletions(-) diff --git a/main.py b/main.py index a2d9638..793f8f5 100644 --- a/main.py +++ b/main.py @@ -1,13 +1,15 @@ import re from bigxml import Parser, xml_handle_element -import time -from tests import * - -# wanna find postgresql and postgresql-version tags -def get_postgresql_tags(file_name: str) -> list: +def get_postgresql_tags(input_filepath_tags_xml: str) -> list: + """Return a list of wanted postgresql tags. + + Argument: + input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump. + """ + result = [] - file = open(file_name, 'r', encoding='utf8') + file = open(input_filepath_tags_xml, 'r', encoding='utf8') for row in file: id_tagname = re.search('<row Id="(.+?)" TagName="(.+?)"', row) @@ -38,33 +40,55 @@ def get_postgresql_tags(file_name: str) -> list: @xml_handle_element('posts', 'row') def handler(node): + """Handler for BigXML parser. + + Function which parses the XML row and + yields user-defined structure. + """ + post_type_id = node.attributes['PostTypeId'] id = node.attributes['Id'] body = node.attributes['Body'] - try: - tags = node.attributes['Tags'] - except KeyError: - tags = 'N/A' - - try: - parent_id = node.attributes['ParentId'] - except KeyError: - parent_id = 'N/A' - - try: - accepted_answer_id = node.attributes['AcceptedAnswerId'] - except KeyError: - accepted_answer_id = 'N/A' - - yield { - 'id': id, - 'body': body, - 'tags': tags, - 'parent_id': parent_id, - 'accepted_answer_id': accepted_answer_id - } + # question + if post_type_id == '1': + try: + tags = node.attributes['Tags'] + + yield { + 'post_type_id': post_type_id, + 'id': id, + 'body': body, + 'tags': tags, + } + except KeyError: + yield 'N/A' + + # answer + elif post_type_id == '2': + try: + parent_id = node.attributes['ParentId'] + + yield { + 'post_type_id': post_type_id, + 'id': id, + 'body': body, + 'parent_id': parent_id + } + except KeyError: + yield 'N/A' + else: + yield 'N/A' def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool: + """ + Check for postgresql tags. + Return true when found. + + Arguments: + postgresql_tags -- list of wanted postgresql tags ('postgresql', versions). + question_tags -- string of the current question's tags. + """ + correct = False for tag in postgresql_tags: if tag in question_tags: @@ -72,23 +96,39 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> break return correct -# filter out wanted questions and save them in a separate file for quicker access -def filter_postgresql_questions(file_name: str) -> None: - postgresql_tags = get_postgresql_tags('D:\stackoverflow.com\Tags.xml') - output_file_questions = open('D:\postgresql_questions.txt', 'a', encoding='utf8') - output_file_all_answers = open('D:\\all_answers.txt', 'a', encoding='utf8') +def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None: + """Filter out postgresql questions. + + Read posts from a file by doses (1GB). + Let parser and handler yield row values in expected structure. + Distinguish between questions and answers. + Check for postgresql tags, save if found. + + Arguments: + input_filepath_posts_xml -- path to the Posts.xml from Stackoverflow's data dump. + input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump. + output_filepath_questions -- path where we save filtered questions. + output_filepath_answers -- path where we save all answers. + """ + + postgresql_tags = get_postgresql_tags(input_filepath_tags_xml) + output_file_questions = open(output_filepath_questions, 'a', encoding='utf8') + output_file_all_answers = open(output_filepath_answers, 'a', encoding='utf8') - with open(file_name, 'rb') as XML_file: + with open(input_filepath_posts_xml, 'rb') as XML_file: rows = XML_file.readlines(1000000000) - # todo - encoding to utf8 for each readlines (insert header into list) - while len(rows) > 1: - rows.append(b'</posts>') + while len(rows) > 2: + if rows[len(rows) - 1] != b'</posts>': + rows.append(b'</posts>') for item in Parser(rows).iter_from(handler): # answer - if item['tags'] == 'N/A': + if item == 'N/A': + continue + + if item['post_type_id'] == '2': output_file_all_answers.write(str(item) + '\n') continue @@ -97,41 +137,81 @@ def filter_postgresql_questions(file_name: str) -> None: output_file_questions.write(str(item) + '\n') rows = XML_file.readlines(1000000000) - rows.insert(0, b'<posts>') + rows.insert(0, b'<?xml version="1.0" encoding="utf-8"?>') + rows.insert(1, b'<posts>') output_file_questions.close() output_file_all_answers.close() -def get_questions() -> list: - input_file_questions = open('D:\postgresql_questions.txt', 'r', encoding='utf8') +def get_questions(input_filepath_questions) -> list: + """ + Read postgresql questions into a list, represented as tuples. + + Argument: + input_filepath_questions -- path to the file with postgresql questions. + """ + + input_file_questions = open(input_filepath_questions, 'r', encoding='utf8') result = [] for row in input_file_questions: - id_body = re.search("{'id': '(.+?)', 'body': '(.+?)'", row) + id_body = re.search("{'post_type_id': '1', 'id': '(.+?)', 'body': (.+?), 'tags'", row) if not id_body: print('Failed to read question from file!') continue tuple_id_body = id_body.group(1, 2) - result.append(tuple_id_body) + result.append(('1',) + tuple_id_body) input_file_questions.close() return result -# not tested -def link_questions_with_answers(questions: list) -> dict: - input_file_answers = open('D:\\all_answers.txt', 'r', encoding='utf8') +def save_linked(output_filepath: str, linked_questions_answers: dict) -> None: + """ + Write linked postgresql questions with their answers into a file. + + Arguments: + output_filepath -- path where we want to save everything. + linked_questions_answers -- dictionary with questions as keys and lists of answers as values + """ + + output_file = open(output_filepath, 'a', encoding='utf8') + + for question, answers in linked_questions_answers.items(): + output_file.write(str(question) + '\n') + + for answer in answers: + output_file.write(str(answer) + '\n') + + output_file.close() + +def link_questions_with_answers(input_filepath_answers: str, questions: list) -> None: + """Link postgresql questions with corresponding answers. + + Sort postgresql question based on their ID. + Load answers, check their parent_id attribute. + Link postgresql answers to their questions. + Call function save_linked(result: dict) to save all into a file. + + Arguments: + input_filepath_answers -- path to the file with answers. + questions -- list with postgresql questions, represented as tuples. + """ + + input_file_answers = open(input_filepath_answers, 'r', encoding='utf8') result = {} + questions.sort(key = lambda x: int(x[1])) + for tuple_key in questions: - result[tuple_key] = [] + result[tuple_key[1]] = [] rows = input_file_answers.readlines(1000000000) while len(rows) > 0: for answer in rows: - id_body_parent = re.search("{'id': '(.+?)', 'body': '(.+?)', 'tags': 'N/A', 'parent_id': '(.+?)'", answer) + id_body_parent = re.search("{'post_type_id': '2', 'id': '(.+?)', 'body': (.+?), 'parent_id': '(.+?)'", answer) if not id_body_parent: print('Failed to read answer from file!') @@ -139,20 +219,46 @@ def link_questions_with_answers(questions: list) -> dict: tuple_id_body_parent = id_body_parent.group(1, 2, 3) - for key in result: - if key[0] == tuple_id_body_parent[2]: - result[key].append(tuple_id_body_parent) - break + if result.get(tuple_id_body_parent[2]) is not None: + result[tuple_id_body_parent[2]].append(('2',) + tuple_id_body_parent) rows = input_file_answers.readlines(1000000000) input_file_answers.close() + + result = dict(zip(questions, result.values())) + save_linked(result) + return result -#{'id': id, 'body': body, 'tags': tags, 'parent_id': parent_id, 'accepted_answer_id': accepted_answer_id} +def retrieve_linked(input_filepath_linked: str) -> dict: + """Get questions and answers. + + Read saved postgresql questions with their corresponding + answers from a file into a dictionary. + + Argument: + input_filepath_linked -- path to the file with questions and answers. + """ + + input_file = open(input_filepath_linked, 'r', encoding='utf8') + result = {} + row = input_file.readline() + current = -1 + + while row: + row_tuple = eval(row) + if row_tuple[0] == '1': + current = row_tuple[1::] + result[current] = [] + else: + result[current].append(row_tuple[1::]) + + row = input_file.readline() + + input_file.close() + return result if __name__ == "__main__": - # filter_postgresql_questions('D:\stackoverflow.com\Posts.xml') - questions = get_questions() - # questions_answers = link_questions_with_answers(get_questions) \ No newline at end of file + questions_answers = retrieve_linked('D:\\final.txt') \ No newline at end of file diff --git a/tests.py b/tests.py index d28f1aa..4cf7bb3 100644 --- a/tests.py +++ b/tests.py @@ -31,4 +31,49 @@ def test_dict_tuple_key() -> None: print('No match') print(result) - print(type(result)) \ No newline at end of file + print(type(result)) + +def wtha(): + with open('D:\\all_answers.txt', 'r', encoding='utf8') as f: + for row in f: + row_tmp = eval(row) + if row_tmp['id'] == '4416': + print(row) + +def testing(): + with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'w', encoding='utf8') as output_fule: + output_fule.write(str(('1', 'Hello1')) + '\n') + output_fule.write(str(('2', 'Hello2')) + '\n') + output_fule.write(str(('2', 'Hello3')) + '\n') + output_fule.write(str(('1', 'Hello4')) + '\n') + output_fule.write(str(('2', 'Hello5')) + '\n') + + result = {} + with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'r', encoding='utf8') as input_fule: + row = input_fule.readline() + current = eval(row)[1] + while row: + if (eval(row)[0] == '1'): + current = eval(row)[1] + result[current] = [] + else: + result[current].append(eval(row)) + row = input_fule.readline() + + print(result) + +def testing2(): + questions = [('1', 'hello', 'yush')] + mydict = { + ('1', '2'): "any", + ('2', '3'): "any2", + ('3', '4'): "any3", + ('4', '5'): "any4", + ('5', '6'): "any5", + } + + print(mydict) + + mydict = dict(zip(questions, mydict.values())) + + print(mydict) -- GitLab