cleanup, filepaths, comments

3a86d2a4 · Mesharo · 4c1c282c · 3a86d2a4 · 3a86d2a4
Commit 3a86d2a4 authored 1 year ago by Mesharo
--- a/main.py
+++ b/main.py
 import re
 from bigxml import Parser, xml_handle_element
-import time
-from tests import *
+def get_postgresql_tags(input_filepath_tags_xml: str) -> list:
+    """Return a list of wanted postgresql tags.
-# wanna find postgresql and postgresql-version tags
-def get_postgresql_tags(file_name: str) -> list:
+    Argument:
+    input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
+    """
    result = []
-    file = open(file_name, 'r', encoding='utf8')
+    file = open(input_filepath_tags_xml, 'r', encoding='utf8')
    for row in file:
        id_tagname = re.search('<row Id="(.+?)" TagName="(.+?)"', row)
@@ -38,33 +40,55 @@ def get_postgresql_tags(file_name: str) -> list:
 @xml_handle_element('posts', 'row')
 def handler(node):
+    """Handler for BigXML parser.
+    Parses a single XML row and
+    yields a user-defined structure.
+    """
+    post_type_id = node.attributes['PostTypeId']
    id = node.attributes['Id']
    body = node.attributes['Body']
-    try:
+    # question
-        tags = node.attributes['Tags']
+    if post_type_id == '1':
-    except KeyError:
+        try:
-        tags = 'N/A'
+            tags = node.attributes['Tags']
-    try:
+            yield {
-        parent_id = node.attributes['ParentId']
+                'post_type_id': post_type_id,
-    except KeyError:
+                'id': id,
-        parent_id = 'N/A'
+                'body': body,
+                'tags': tags,
-    try:
+            }
-        accepted_answer_id = node.attributes['AcceptedAnswerId']
+        except KeyError:
-    except KeyError:
+            yield 'N/A'
-        accepted_answer_id = 'N/A'
+    # answer
-    yield {
+    elif post_type_id == '2':
-        'id': id,
+        try:
-        'body': body,
+            parent_id = node.attributes['ParentId']
-        'tags': tags,
-        'parent_id': parent_id,
+            yield {
-        'accepted_answer_id': accepted_answer_id
+                'post_type_id': post_type_id,
-    }
+                'id': id,
+                'body': body,
+                'parent_id': parent_id
+            }
+        except KeyError:
+            yield 'N/A'
+    else:
+        yield 'N/A'
 def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool:
+    """
+    Check for postgresql tags.
+    Return True when one is found.
+    Arguments:
+    postgresql_tags -- list of wanted postgresql tags ('postgresql', versions).
+    question_tags -- string of the current question's tags.
+    """
    correct = False
    for tag in postgresql_tags:
        if tag in question_tags:
@@ -72,23 +96,39 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
            break
    return correct
-# filter out wanted questions and save them in a separate file for quicker access
+def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
-def filter_postgresql_questions(file_name: str) -> None:
+    """Filter out postgresql questions.
-    postgresql_tags = get_postgresql_tags('D:\stackoverflow.com\Tags.xml')
-    output_file_questions = open('D:\postgresql_questions.txt', 'a', encoding='utf8')
+    Read posts from the file in chunks (about 1 GB at a time).
-    output_file_all_answers = open('D:\\all_answers.txt', 'a', encoding='utf8')
+    Let the parser and handler yield row values in the expected structure.
+    Distinguish between questions and answers.
+    Check for postgresql tags, save if found.
+    Arguments:
+    input_filepath_posts_xml -- path to the Posts.xml from Stackoverflow's data dump.
+    input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
+    output_filepath_questions -- path where we save filtered questions.
+    output_filepath_answers -- path where we save all answers.
+    """
+    postgresql_tags = get_postgresql_tags(input_filepath_tags_xml)
+    output_file_questions = open(output_filepath_questions, 'a', encoding='utf8')
+    output_file_all_answers = open(output_filepath_answers, 'a', encoding='utf8')
-    with open(file_name, 'rb') as XML_file:
+    with open(input_filepath_posts_xml, 'rb') as XML_file:
        rows = XML_file.readlines(1000000000)
-        # todo - encoding to utf8 for each readlines (insert header into list)
+        while len(rows) > 2:
-        while len(rows) > 1:
+            if rows[len(rows) - 1] != b'</posts>':
-            rows.append(b'</posts>')
+                rows.append(b'</posts>')
            for item in Parser(rows).iter_from(handler):
                # answer
-                if item['tags'] == 'N/A':
+                if item == 'N/A':
+                    continue
+                if item['post_type_id'] == '2':
                    output_file_all_answers.write(str(item) + '\n')
                    continue
@@ -97,41 +137,81 @@ def filter_postgresql_questions(file_name: str) -> None:
                    output_file_questions.write(str(item) + '\n')
            rows = XML_file.readlines(1000000000)
-            rows.insert(0, b'<posts>')
+            rows.insert(0, b'<?xml version="1.0" encoding="utf-8"?>')
+            rows.insert(1, b'<posts>')
    output_file_questions.close()
    output_file_all_answers.close()
-def get_questions() -> list:
+def get_questions(input_filepath_questions) -> list:
-    input_file_questions = open('D:\postgresql_questions.txt', 'r', encoding='utf8')
+    """
+    Read postgresql questions into a list of tuples.
+    Argument:
+    input_filepath_questions -- path to the file with postgresql questions.
+    """
+    input_file_questions = open(input_filepath_questions, 'r', encoding='utf8')
    result = []
    for row in input_file_questions:
-        id_body = re.search("{'id': '(.+?)', 'body': '(.+?)'", row)
+        id_body = re.search("{'post_type_id': '1', 'id': '(.+?)', 'body': (.+?), 'tags'", row)
        if not id_body:
            print('Failed to read question from file!')
            continue
        tuple_id_body = id_body.group(1, 2)
-        result.append(tuple_id_body)
+        result.append(('1',) + tuple_id_body)
    input_file_questions.close()
    return result
-# not tested
+def save_linked(output_filepath: str, linked_questions_answers: dict) -> None:
-def link_questions_with_answers(questions: list) -> dict:
+    """
-    input_file_answers = open('D:\\all_answers.txt', 'r', encoding='utf8')
+    Write linked postgresql questions with their answers into a file.
+    Arguments:
+    output_filepath -- path where we want to save everything.
+    linked_questions_answers -- dictionary with questions as keys and lists of answers as values.
+    """
+    output_file = open(output_filepath, 'a', encoding='utf8')
+    for question, answers in linked_questions_answers.items():
+        output_file.write(str(question) + '\n')
+        for answer in answers:
+            output_file.write(str(answer) + '\n')
+    output_file.close()
+def link_questions_with_answers(input_filepath_answers: str, questions: list) -> None:
+    """Link postgresql questions with corresponding answers.
+    Sort postgresql questions by their ID.
+    Load answers, check their parent_id attribute.
+    Link postgresql answers to their questions.
+    Call save_linked() to save everything into a file.
+    Arguments:
+    input_filepath_answers -- path to the file with answers.
+    questions -- list with postgresql questions, represented as tuples.
+    """
+    input_file_answers = open(input_filepath_answers, 'r', encoding='utf8')
    result = {}
+    questions.sort(key = lambda x: int(x[1]))
    for tuple_key in questions:
-        result[tuple_key] = []
+        result[tuple_key[1]] = []
    rows = input_file_answers.readlines(1000000000)
    while len(rows) > 0:
        for answer in rows:
-            id_body_parent = re.search("{'id': '(.+?)', 'body': '(.+?)', 'tags': 'N/A', 'parent_id': '(.+?)'", answer)
+            id_body_parent = re.search("{'post_type_id': '2', 'id': '(.+?)', 'body': (.+?), 'parent_id': '(.+?)'", answer)
            if not id_body_parent:
                print('Failed to read answer from file!')
@@ -139,20 +219,46 @@ def link_questions_with_answers(questions: list) -> dict:
            tuple_id_body_parent = id_body_parent.group(1, 2, 3)
-            for key in result:
+            if result.get(tuple_id_body_parent[2]) is not None:
-                if key[0] == tuple_id_body_parent[2]:
+                result[tuple_id_body_parent[2]].append(('2',) + tuple_id_body_parent)
-                    result[key].append(tuple_id_body_parent)
-                    break
        rows = input_file_answers.readlines(1000000000)
    input_file_answers.close()
+    result = dict(zip(questions, result.values()))
+    save_linked(result)
    return result
-#{'id': id, 'body': body, 'tags': tags, 'parent_id': parent_id, 'accepted_answer_id': accepted_answer_id}
+def retrieve_linked(input_filepath_linked: str) -> dict:
+    """Get questions and answers.
+    Read saved postgresql questions with their corresponding
+    answers from a file into a dictionary.
+    Argument:
+    input_filepath_linked -- path to the file with questions and answers.
+    """
+    input_file = open(input_filepath_linked, 'r', encoding='utf8')
+    result = {}
+    row = input_file.readline()
+    current = -1
+    while row:
+        row_tuple = eval(row)
+        if row_tuple[0] == '1':
+            current = row_tuple[1::]
+            result[current] = []
+        else:
+            result[current].append(row_tuple[1::])
+        row = input_file.readline()
+    input_file.close()
+    return result
 if __name__ == "__main__":
-    # filter_postgresql_questions('D:\stackoverflow.com\Posts.xml')
+    questions_answers = retrieve_linked('D:\\final.txt')
-    questions = get_questions()
\ No newline at end of file
-    # questions_answers = link_questions_with_answers(get_questions)
\ No newline at end of file
--- a/tests.py
+++ b/tests.py
@@ -31,4 +31,49 @@ def test_dict_tuple_key() -> None:
            print('No match')
    print(result)
    print(type(result))
\ No newline at end of file
+def wtha():
+    with open('D:\\all_answers.txt', 'r', encoding='utf8') as f:
+        for row in f:
+            row_tmp = eval(row)
+            if row_tmp['id'] == '4416':
+                print(row)
+def testing():
+    with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'w', encoding='utf8') as output_fule:
+        output_fule.write(str(('1', 'Hello1')) + '\n')
+        output_fule.write(str(('2', 'Hello2')) + '\n')
+        output_fule.write(str(('2', 'Hello3')) + '\n')
+        output_fule.write(str(('1', 'Hello4')) + '\n')
+        output_fule.write(str(('2', 'Hello5')) + '\n')
+    result = {}
+    with open('C:\\Users\\Hecko\\Desktop\\tmp.txt', 'r', encoding='utf8') as input_fule:
+        row = input_fule.readline()
+        current = eval(row)[1]
+        while row:
+            if (eval(row)[0] == '1'):
+                current = eval(row)[1]
+                result[current] = []
+            else:
+                result[current].append(eval(row))
+            row = input_fule.readline()
+    print(result)
+def testing2():
+    questions = [('1', 'hello', 'yush')]
+    mydict = {
+        ('1', '2'): "any",
+        ('2', '3'): "any2",
+        ('3', '4'): "any3",
+        ('4', '5'): "any4",
+        ('5', '6'): "any5",
+    }
+    print(mydict)
+    mydict = dict(zip(questions, mydict.values()))
+    print(mydict)