From 3a86d2a4b14a2d20cbcbb5872c66915381f185ce Mon Sep 17 00:00:00 2001
From: Mesharo <Hecko97@seznam.cz>
Date: Thu, 10 Oct 2024 19:08:14 +0200
Subject: [PATCH] cleanup, filepaths, comments

---
 main.py  | 218 +++++++++++++++++++++++++++++++++++++++++--------------
 tests.py |  47 +++++++++++-
 2 files changed, 208 insertions(+), 57 deletions(-)

diff --git a/main.py b/main.py
index a2d9638..793f8f5 100644
--- a/main.py
+++ b/main.py
@@ -1,13 +1,15 @@
 import re
 from bigxml import Parser, xml_handle_element
-import time
 
-from tests import *
-
-# wanna find postgresql and postgresql-version tags
-def get_postgresql_tags(file_name: str) -> list:
+def get_postgresql_tags(input_filepath_tags_xml: str) -> list:
+    """Return a list of wanted postgresql tags.
+    
+    Argument:
+    input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
+    """
+    
     result = []
-    file = open(file_name, 'r', encoding='utf8')
+    file = open(input_filepath_tags_xml, 'r', encoding='utf8')
 
     for row in file:
         id_tagname = re.search('<row Id="(.+?)" TagName="(.+?)"', row)
@@ -38,33 +40,55 @@ def get_postgresql_tags(file_name: str) -> list:
 
 @xml_handle_element('posts', 'row')
 def handler(node):
+    """Handler for BigXML parser.
+    
+    Parse one XML row and yield a dict for a question or an answer,
+    or the string 'N/A' for any row that cannot be used.
+    """
+    post_type_id = node.attributes['PostTypeId']
     id = node.attributes['Id']
     body = node.attributes['Body']
 
-    try:
-        tags = node.attributes['Tags']
-    except KeyError:
-        tags = 'N/A'
-
-    try:
-        parent_id = node.attributes['ParentId']
-    except KeyError:
-        parent_id = 'N/A'
-
-    try:
-        accepted_answer_id = node.attributes['AcceptedAnswerId']
-    except KeyError:
-        accepted_answer_id = 'N/A'
-    
-    yield {
-        'id': id,
-        'body': body,
-        'tags': tags,
-        'parent_id': parent_id,
-        'accepted_answer_id': accepted_answer_id
-    }
+    # question
+    if post_type_id == '1':
+        try:
+            tags = node.attributes['Tags']
+
+            yield {
+                'post_type_id': post_type_id,
+                'id': id,
+                'body': body,
+                'tags': tags,
+            }
+        except KeyError:
+            yield 'N/A'
+
+    # answer
+    elif post_type_id == '2':
+        try:
+            parent_id = node.attributes['ParentId']
+
+            yield {
+                'post_type_id': post_type_id,
+                'id': id,
+                'body': body,
+                'parent_id': parent_id
+            }
+        except KeyError:
+            yield 'N/A'
+    else:
+        yield 'N/A'
 
 def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool:
+    """
+    Check for postgresql tags.
+    Return true when found.
+
+    Arguments:
+    postgresql_tags -- list of wanted postgresql tags ('postgresql', versions).
+    question_tags -- string of the current question's tags.
+    """
+    
     correct = False
     for tag in postgresql_tags:
         if tag in question_tags:
@@ -72,23 +96,39 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
             break
     return correct
 
-# filter out wanted questions and save them in a separate file for quicker access
-def filter_postgresql_questions(file_name: str) -> None:
-    postgresql_tags = get_postgresql_tags('D:\stackoverflow.com\Tags.xml')
-    output_file_questions = open('D:\postgresql_questions.txt', 'a', encoding='utf8')
-    output_file_all_answers = open('D:\\all_answers.txt', 'a', encoding='utf8')
+def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
+    """Filter out postgresql questions.
+    
+    Read posts from a file in chunks (1GB).
+    Let parser and handler yield row values in expected structure.
+    Distinguish between questions and answers.
+    Check for postgresql tags, save if found. 
+
+    Arguments:
+    input_filepath_posts_xml -- path to the Posts.xml from Stackoverflow's data dump.
+    input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
+    output_filepath_questions -- path where we save filtered questions.
+    output_filepath_answers -- path where we save all answers.
+    """
+
+    postgresql_tags = get_postgresql_tags(input_filepath_tags_xml)
+    output_file_questions = open(output_filepath_questions, 'a', encoding='utf8')
+    output_file_all_answers = open(output_filepath_answers, 'a', encoding='utf8')
 
-    with open(file_name, 'rb') as XML_file:
+    with open(input_filepath_posts_xml, 'rb') as XML_file:
 
         rows = XML_file.readlines(1000000000)
 
-        # todo - encoding to utf8 for each readlines (insert header into list)
-        while len(rows) > 1:
-            rows.append(b'</posts>')
+        while len(rows) > 2:
+            if rows[len(rows) - 1] != b'</posts>':
+                rows.append(b'</posts>')
             
             for item in Parser(rows).iter_from(handler):
                 # answer
-                if item['tags'] == 'N/A':
+                if item == 'N/A':
+                    continue
+
+                if item['post_type_id'] == '2':
                     output_file_all_answers.write(str(item) + '\n')
                     continue
                 
@@ -97,41 +137,81 @@ def filter_postgresql_questions(file_name: str) -> None:
                     output_file_questions.write(str(item) + '\n')
 
             rows = XML_file.readlines(1000000000)
-            rows.insert(0, b'<posts>')
+            rows.insert(0, b'<?xml version="1.0" encoding="utf-8"?>')
+            rows.insert(1, b'<posts>')
 
     output_file_questions.close()
     output_file_all_answers.close()
 
-def get_questions() -> list:
-    input_file_questions = open('D:\postgresql_questions.txt', 'r', encoding='utf8')
+def get_questions(input_filepath_questions) -> list:
+    """
+    Read postgresql questions into a list, represented as tuples.
+
+    Argument:
+    input_filepath_questions -- path to the file with postgresql questions.
+    """
+    
+    input_file_questions = open(input_filepath_questions, 'r', encoding='utf8')
     result = []
 
     for row in input_file_questions:
-        id_body = re.search("{'id': '(.+?)', 'body': '(.+?)'", row)
+        id_body = re.search("{'post_type_id': '1', 'id': '(.+?)', 'body': (.+?), 'tags'", row)
 
         if not id_body:
             print('Failed to read question from file!')
             continue
         
         tuple_id_body = id_body.group(1, 2)
-        result.append(tuple_id_body)
+        result.append(('1',) + tuple_id_body)
 
     input_file_questions.close()
     return result
 
-# not tested
-def link_questions_with_answers(questions: list) -> dict:
-    input_file_answers = open('D:\\all_answers.txt', 'r', encoding='utf8')
+def save_linked(output_filepath: str, linked_questions_answers: dict) -> None:
+    """
+    Write linked postgresql questions with their answers into a file.
+
+    Arguments:
+    output_filepath -- path where we want to save everything.
+    linked_questions_answers -- dictionary with questions as keys and lists of answers as values.
+    """
+
+    output_file = open(output_filepath, 'a', encoding='utf8')
+
+    for question, answers in linked_questions_answers.items():
+        output_file.write(str(question) + '\n')
+
+        for answer in answers:
+            output_file.write(str(answer) + '\n')
+
+    output_file.close()
+
+def link_questions_with_answers(input_filepath_answers: str, questions: list) -> None:
+    """Link postgresql questions with corresponding answers.
+    
+    Sort postgresql questions based on their ID.
+    Load answers, check their parent_id attribute.
+    Link postgresql answers to their questions.
+    Call save_linked() to save the linked result into a file.
+
+    Arguments:
+    input_filepath_answers -- path to the file with answers.
+    questions -- list with postgresql questions, represented as tuples.
+    """
+
+    input_file_answers = open(input_filepath_answers, 'r', encoding='utf8')
     result = {}
 
+    questions.sort(key = lambda x: int(x[1]))
+
     for tuple_key in questions:
-        result[tuple_key] = []
+        result[tuple_key[1]] = []
 
     rows = input_file_answers.readlines(1000000000)
 
     while len(rows) > 0:
         for answer in rows:
-            id_body_parent = re.search("{'id': '(.+?)', 'body': '(.+?)', 'tags': 'N/A', 'parent_id': '(.+?)'", answer)
+            id_body_parent = re.search("{'post_type_id': '2', 'id': '(.+?)', 'body': (.+?), 'parent_id': '(.+?)'", answer)
 
             if not id_body_parent:
                 print('Failed to read answer from file!')
@@ -139,20 +219,46 @@ def link_questions_with_answers(questions: list) -> dict:
 
             tuple_id_body_parent = id_body_parent.group(1, 2, 3)
 
-            for key in result:
-                if key[0] == tuple_id_body_parent[2]:
-                    result[key].append(tuple_id_body_parent)
-                    break
+            if result.get(tuple_id_body_parent[2]) is not None:
+                result[tuple_id_body_parent[2]].append(('2',) + tuple_id_body_parent)
 
         rows = input_file_answers.readlines(1000000000)
 
     input_file_answers.close()
+
+    result = dict(zip(questions, result.values()))
+    save_linked(result)
+
     return result
 
-#{'id': id, 'body': body, 'tags': tags, 'parent_id': parent_id, 'accepted_answer_id': accepted_answer_id}
+def retrieve_linked(input_filepath_linked: str) -> dict:
+    """Get questions and answers.
+    
+    Read saved postgresql questions with their corresponding
+    answers from a file into a dictionary.
+
+    Argument:
+    input_filepath_linked -- path to the file with questions and answers.
+    """
+
+    input_file = open(input_filepath_linked, 'r', encoding='utf8')
+    result = {}
 
+    row = input_file.readline()
+    current = -1
+
+    while row:
+        row_tuple = eval(row)
+        if row_tuple[0] == '1':
+            current = row_tuple[1::]
+            result[current] = []
+        else:
+            result[current].append(row_tuple[1::])
+        
+        row = input_file.readline()
+
+    input_file.close()
+    return result
 
 if __name__ == "__main__":
-    # filter_postgresql_questions('D:\stackoverflow.com\Posts.xml')
-    questions = get_questions()
-    # questions_answers = link_questions_with_answers(get_questions)
\ No newline at end of file
+    questions_answers = retrieve_linked('D:\\final.txt')
\ No newline at end of file
diff --git a/tests.py b/tests.py
index d28f1aa..4cf7bb3 100644
--- a/tests.py
+++ b/tests.py
@@ -31,4 +31,49 @@ def test_dict_tuple_key() -> None:
             print('No match')
 
     print(result)
-    print(type(result))
\ No newline at end of file
+    print(type(result))
+
+def wtha():
+    with open('D:\\all_answers.txt', 'r', encoding='utf8') as f:
+        for row in f:
+            row_tmp = eval(row)
+            if row_tmp['id'] == '4416':
+                print(row)
+
+def testing():
+    with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'w', encoding='utf8') as output_fule:
+        output_fule.write(str(('1', 'Hello1')) + '\n')
+        output_fule.write(str(('2', 'Hello2')) + '\n')
+        output_fule.write(str(('2', 'Hello3')) + '\n')
+        output_fule.write(str(('1', 'Hello4')) + '\n')
+        output_fule.write(str(('2', 'Hello5')) + '\n')
+
+    result = {}
+    with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'r', encoding='utf8') as input_fule:
+        row = input_fule.readline()
+        current = eval(row)[1]
+        while row:
+            if (eval(row)[0] == '1'):
+                current = eval(row)[1]
+                result[current] = []
+            else:
+                result[current].append(eval(row))
+            row = input_fule.readline()
+    
+    print(result)
+
+def testing2():
+    questions = [('1', 'hello', 'yush')]
+    mydict = {
+        ('1', '2'): "any",
+        ('2', '3'): "any2",
+        ('3', '4'): "any3",
+        ('4', '5'): "any4",
+        ('5', '6'): "any5",
+    }
+
+    print(mydict)
+
+    mydict = dict(zip(questions, mydict.values()))
+
+    print(mydict)
-- 
GitLab