From 4c1c282c61b3e7a5057564a658c5b535d191c7a3 Mon Sep 17 00:00:00 2001
From: Mesharo <Hecko97@seznam.cz>
Date: Mon, 30 Sep 2024 12:18:36 +0200
Subject: [PATCH] question IDs, matching with answers, testing area

---
 main.py  | 104 ++++++++++++++++++++++++++++++++++++++++++++++++-------
 tests.py |  34 ++++++++++++++++++
 2 files changed, 125 insertions(+), 13 deletions(-)
 create mode 100644 tests.py

diff --git a/main.py b/main.py
index 5933ae8..a2d9638 100644
--- a/main.py
+++ b/main.py
@@ -2,6 +2,8 @@ import re
 from bigxml import Parser, xml_handle_element
 import time
 
+from tests import *
+
 # wanna find postgresql and postgresql-version tags
 def get_postgresql_tags(file_name: str) -> list:
     result = []
@@ -44,6 +46,11 @@ def handler(node):
     except KeyError:
         tags = 'N/A'
 
+    try:
+        parent_id = node.attributes['ParentId']
+    except KeyError:
+        parent_id = 'N/A'
+
     try:
         accepted_answer_id = node.attributes['AcceptedAnswerId']
     except KeyError:
@@ -53,28 +60,99 @@ def handler(node):
         'id': id,
         'body': body,
         'tags': tags,
+        'parent_id': parent_id,
         'accepted_answer_id': accepted_answer_id
     }
 
+def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool:
+    """Return True when any known PostgreSQL tag occurs in the question's tag string.
+
+    Each entry of ``postgresql_tags`` is checked with ``in`` against
+    ``question_tags``; an empty tag list yields False.
+    """
+    return any(tag in question_tags for tag in postgresql_tags)
 
 # filter out wanted questions and save them in a separate file for quicker access
 def filter_postgresql_questions(file_name: str) -> None:
+    """Append PostgreSQL-tagged questions and all tag-less posts (answers) to two text files."""
     postgresql_tags = get_postgresql_tags('D:\stackoverflow.com\Tags.xml')
-    output_file = open('D:\postgresql_questions.txt', 'a', encoding='utf8')
-    count = 0
+    output_file_questions = open('D:\\postgresql_questions.txt', 'a', encoding='utf8')
+    output_file_all_answers = open('D:\\all_answers.txt', 'a', encoding='utf8')
 
-    start = time.time()
     with open(file_name, 'rb') as XML_file:
-        for item in Parser(XML_file).iter_from(handler):
-            for correct_tag in postgresql_tags:
-                if correct_tag in item['tags']:
-                    output_file.write(str(item) + '\n')
-            print(count)
-            count += 1
-    end = time.time()
-    print(f'Time elapsed: {(end - start) / 60}')
 
-    output_file.close()
+        rows = XML_file.readlines(1000000000)
+        # todo - encoding to utf8 for each readlines (insert header into list)
+        while len(rows) > 1:
+            # A chunk cut out of the middle of the file lacks the closing tag, but the
+            # final chunk already ends with it; a second one is not well-formed XML.
+            if rows[-1].strip() != b'</posts>':
+                rows.append(b'</posts>')
+            for item in Parser(rows).iter_from(handler):
+                # answer
+                if item['tags'] == 'N/A':
+                    output_file_all_answers.write(str(item) + '\n')
+                    continue
+                # question
+                if is_question_tagged_postgresql(postgresql_tags, item['tags']):
+                    output_file_questions.write(str(item) + '\n')
+            rows = XML_file.readlines(1000000000)
+            rows.insert(0, b'<posts>')
+
+    output_file_questions.close()
+    output_file_all_answers.close()
+
+def get_questions() -> list:
+    """Load the saved PostgreSQL questions as a list of ``(id, body)`` tuples.
+
+    Each row is the ``str()`` of a dict, so it is parsed back as a literal; a
+    regex over that text misses bodies containing quotes. Bad rows are reported.
+    """
+    import ast  # local import: the module header is left untouched
+    result = []
+    with open('D:\\postgresql_questions.txt', 'r', encoding='utf8') as input_file_questions:
+        for row in input_file_questions:
+            try:
+                item = ast.literal_eval(row.strip())
+                result.append((item['id'], item['body']))
+            except (ValueError, SyntaxError, KeyError, TypeError):
+                print('Failed to read question from file!')
+    return result
+
+# not tested
+def link_questions_with_answers(questions: list) -> dict:
+    """Group the saved answers under the question they belong to.
+
+    Args:
+        questions: ``(id, body)`` tuples as returned by ``get_questions``.
+
+    Returns:
+        A dict mapping each question tuple to its list of
+        ``(id, body, parent_id)`` answer tuples (empty when unanswered).
+    """
+    import ast  # local import: the module header is left untouched
+    result = {tuple_key: [] for tuple_key in questions}
+    # Index by question id so each answer costs one lookup instead of a scan
+    # over every question; the first tuple seen for an id receives the answers.
+    key_by_id = {}
+    for tuple_key in questions:
+        key_by_id.setdefault(tuple_key[0], tuple_key)
+    with open('D:\\all_answers.txt', 'r', encoding='utf8') as input_file_answers:
+        for answer in input_file_answers:
+            try:  # rows are str(dict): parse the literal, a regex breaks on quoted bodies
+                item = ast.literal_eval(answer.strip())
+                answer_tuple = (item['id'], item['body'], item['parent_id'])
+            except (ValueError, SyntaxError, KeyError, TypeError):
+                print('Failed to read answer from file!')
+                continue
+            if answer_tuple[2] in key_by_id:
+                result[key_by_id[answer_tuple[2]]].append(answer_tuple)
+    return result
+
+# Shape of each saved row: {'id': id, 'body': body, 'tags': tags, 'parent_id': parent_id, 'accepted_answer_id': accepted_answer_id}
+
 
 if __name__ == "__main__":
-    filter_postgresql_questions('D:\stackoverflow.com\Posts.xml')
\ No newline at end of file
+    # Step 1 (disabled): filter_postgresql_questions('D:\stackoverflow.com\Posts.xml')
+    questions = get_questions()
+    # Step 2 (disabled, not tested): questions_answers = link_questions_with_answers(questions)
\ No newline at end of file
diff --git a/tests.py b/tests.py
new file mode 100644
index 0000000..d28f1aa
--- /dev/null
+++ b/tests.py
@@ -0,0 +1,34 @@
+def test_dict_str_format() -> str:
+    """Append the ``str()`` of a small sample dict to a scratch file.
+
+    Afterwards the first line of a second scratch file is read and returned.
+    NOTE(review): the file written (tmp.txt) and the file read (xd.txt)
+    differ - confirm that this is intended.
+    """
+    sample = {'id': '5', 'body': 'Hello world'}
+    with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'a', encoding='utf8') as out_file:
+        out_file.write(str(sample) + '\n')
+
+    with open('C:\\Users\\Hecko\\Desktop\\xd.txt', 'r', encoding='utf8') as in_file:
+        # readline() returns only the first line, including its line ending
+        first_line = in_file.readline()
+    return first_line
+
+def test_dict_tuple_key() -> None:
+    """Print a demo of tuples used as dict keys and of matching on their first item.
+
+    Builds a dict keyed by ``(id, text)`` tuples whose values are one-element
+    lists holding the id, prints each tuple followed by 'Match' or 'No match'
+    against the id '3', then prints the dict and its type.
+    """
+    pairs = [('5', 'Hello'), ('3', 'World'), ('4', 'Anything')]
+    by_pair = {pair: [pair[0]] for pair in pairs}
+
+    wanted_id = '3'
+
+    for pair in pairs:
+        print(pair)
+        print('Match' if pair[0] == wanted_id else 'No match')
+
+    print(by_pair)
+    print(type(by_pair))
\ No newline at end of file
-- 
GitLab