Skip to content
Snippets Groups Projects
Commit 3a86d2a4 authored by Mesharo's avatar Mesharo
Browse files

cleanup, filepaths, comments

parent 4c1c282c
No related merge requests found
import re
from bigxml import Parser, xml_handle_element
import time
from tests import *
# We want to find the postgresql tag and the postgresql-version tags.
def get_postgresql_tags(file_name: str) -> list:
def get_postgresql_tags(input_filepath_tags_xml: str) -> list:
"""Return a list of wanted postgresql tags.
Argument:
input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
"""
result = []
file = open(file_name, 'r', encoding='utf8')
file = open(input_filepath_tags_xml, 'r', encoding='utf8')
for row in file:
id_tagname = re.search('<row Id="(.+?)" TagName="(.+?)"', row)
......@@ -38,33 +40,55 @@ def get_postgresql_tags(file_name: str) -> list:
@xml_handle_element('posts', 'row')
def handler(node):
    """Handler for BigXML parser.

    Parse one ``<row>`` element found under ``<posts>`` and yield exactly
    one item describing it.

    Yields:
    - for a question (``PostTypeId == '1'``) that has a ``Tags`` attribute:
      a dict with the keys 'post_type_id', 'id', 'body', 'tags';
    - for an answer (``PostTypeId == '2'``) that has a ``ParentId``
      attribute: a dict with the keys 'post_type_id', 'id', 'body',
      'parent_id';
    - the string 'N/A' for every other row (other post types, or a
      question/answer missing the attribute above).

    Raises KeyError when the row lacks 'PostTypeId', 'Id' or 'Body'.
    """
    attributes = node.attributes
    post_type_id = attributes['PostTypeId']
    post_id = attributes['Id']
    body = attributes['Body']

    # question
    if post_type_id == '1':
        # Keep the try body minimal: only the lookup may raise KeyError.
        try:
            tags = attributes['Tags']
        except KeyError:
            yield 'N/A'
            return
        # Key order matters: the dict is later written with str() and
        # read back with a regular expression that relies on this order.
        yield {
            'post_type_id': post_type_id,
            'id': post_id,
            'body': body,
            'tags': tags,
        }
    # answer
    elif post_type_id == '2':
        try:
            parent_id = attributes['ParentId']
        except KeyError:
            yield 'N/A'
            return
        yield {
            'post_type_id': post_type_id,
            'id': post_id,
            'body': body,
            'parent_id': parent_id
        }
    else:
        yield 'N/A'
def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool:
"""
Check for postgresql tags.
Return true when found.
Arguments:
postgresql_tags -- list of wanted postgresql tags ('postgresql', versions).
question_tags -- string of the current question's tags.
"""
correct = False
for tag in postgresql_tags:
if tag in question_tags:
......@@ -72,23 +96,39 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
break
return correct
# filter out wanted questions and save them in a separate file for quicker access
def filter_postgresql_questions(file_name: str) -> None:
postgresql_tags = get_postgresql_tags('D:\stackoverflow.com\Tags.xml')
output_file_questions = open('D:\postgresql_questions.txt', 'a', encoding='utf8')
output_file_all_answers = open('D:\\all_answers.txt', 'a', encoding='utf8')
def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
"""Filter out postgresql questions.
Read posts from a file by doses (1GB).
Let parser and handler yield row values in expected structure.
Distinguish between questions and answers.
Check for postgresql tags, save if found.
Arguments:
input_filepath_posts_xml -- path to the Posts.xml from Stackoverflow's data dump.
input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
output_filepath_questions -- path where we save filtered questions.
output_filepath_answers -- path where we save all answers.
"""
postgresql_tags = get_postgresql_tags(input_filepath_tags_xml)
output_file_questions = open(output_filepath_questions, 'a', encoding='utf8')
output_file_all_answers = open(output_filepath_answers, 'a', encoding='utf8')
with open(file_name, 'rb') as XML_file:
with open(input_filepath_posts_xml, 'rb') as XML_file:
rows = XML_file.readlines(1000000000)
# todo - encoding to utf8 for each readlines (insert header into list)
while len(rows) > 1:
rows.append(b'</posts>')
while len(rows) > 2:
if rows[len(rows) - 1] != b'</posts>':
rows.append(b'</posts>')
for item in Parser(rows).iter_from(handler):
# answer
if item['tags'] == 'N/A':
if item == 'N/A':
continue
if item['post_type_id'] == '2':
output_file_all_answers.write(str(item) + '\n')
continue
......@@ -97,41 +137,81 @@ def filter_postgresql_questions(file_name: str) -> None:
output_file_questions.write(str(item) + '\n')
rows = XML_file.readlines(1000000000)
rows.insert(0, b'<posts>')
rows.insert(0, b'<?xml version="1.0" encoding="utf-8"?>')
rows.insert(1, b'<posts>')
output_file_questions.close()
output_file_all_answers.close()
def get_questions(input_filepath_questions) -> list:
    """Read postgresql questions into a list, represented as tuples.

    Every line of the file is searched for the textual form of a question
    dictionary (``{'post_type_id': '1', 'id': ..., 'body': ..., 'tags'``).
    Lines that do not match are reported on stdout and skipped.

    Argument:
    input_filepath_questions -- path to the file with postgresql questions.

    Returns a list of ``('1', id, body)`` tuples. ``body`` is the raw text
    found between ``'body': `` and ``, 'tags'``, i.e. it still carries the
    surrounding quotes of the written string.
    """
    # Compiled once; the pattern is applied to every line of the file.
    question_pattern = re.compile(
        "{'post_type_id': '1', 'id': '(.+?)', 'body': (.+?), 'tags'"
    )
    result = []
    # The context manager closes the file even if reading fails midway.
    with open(input_filepath_questions, 'r', encoding='utf8') as input_file_questions:
        for row in input_file_questions:
            id_body = question_pattern.search(row)
            if not id_body:
                print('Failed to read question from file!')
                continue
            # Prefix with the post type so questions and answers can be
            # told apart once they are stored together.
            result.append(('1',) + id_body.group(1, 2))
    return result
# not tested
def link_questions_with_answers(questions: list) -> dict:
input_file_answers = open('D:\\all_answers.txt', 'r', encoding='utf8')
def save_linked(output_filepath: str, linked_questions_answers: dict) -> None:
    """Write linked postgresql questions with their answers into a file.

    The file is opened in append mode. Each question is written on its own
    line (as ``str(question)``), immediately followed by one line per
    answer (as ``str(answer)``).

    Arguments:
    output_filepath -- path where we want to save everything.
    linked_questions_answers -- dictionary with questions as keys and lists of answers as values
    """
    # The context manager guarantees the file is closed even when a write fails.
    with open(output_filepath, 'a', encoding='utf8') as output_file:
        for question, answers in linked_questions_answers.items():
            output_file.write(str(question) + '\n')
            for answer in answers:
                output_file.write(str(answer) + '\n')
def link_questions_with_answers(input_filepath_answers: str, questions: list) -> None:
"""Link postgresql questions with corresponding answers.
Sort postgresql question based on their ID.
Load answers, check their parent_id attribute.
Link postgresql answers to their questions.
Call function save_linked(result: dict) to save all into a file.
Arguments:
input_filepath_answers -- path to the file with answers.
questions -- list with postgresql questions, represented as tuples.
"""
input_file_answers = open(input_filepath_answers, 'r', encoding='utf8')
result = {}
questions.sort(key = lambda x: int(x[1]))
for tuple_key in questions:
result[tuple_key] = []
result[tuple_key[1]] = []
rows = input_file_answers.readlines(1000000000)
while len(rows) > 0:
for answer in rows:
id_body_parent = re.search("{'id': '(.+?)', 'body': '(.+?)', 'tags': 'N/A', 'parent_id': '(.+?)'", answer)
id_body_parent = re.search("{'post_type_id': '2', 'id': '(.+?)', 'body': (.+?), 'parent_id': '(.+?)'", answer)
if not id_body_parent:
print('Failed to read answer from file!')
......@@ -139,20 +219,46 @@ def link_questions_with_answers(questions: list) -> dict:
tuple_id_body_parent = id_body_parent.group(1, 2, 3)
for key in result:
if key[0] == tuple_id_body_parent[2]:
result[key].append(tuple_id_body_parent)
break
if result.get(tuple_id_body_parent[2]) is not None:
result[tuple_id_body_parent[2]].append(('2',) + tuple_id_body_parent)
rows = input_file_answers.readlines(1000000000)
input_file_answers.close()
result = dict(zip(questions, result.values()))
save_linked(result)
return result
#{'id': id, 'body': body, 'tags': tags, 'parent_id': parent_id, 'accepted_answer_id': accepted_answer_id}
def retrieve_linked(input_filepath_linked: str) -> dict:
    """Get questions and answers.

    Read saved postgresql questions with their corresponding
    answers from a file into a dictionary.

    Every non-blank line must be the textual form of a tuple. A tuple whose
    first element is '1' starts a new question; every other tuple is
    attached to the most recently read question. The leading type marker
    is dropped from both keys and values.

    Argument:
    input_filepath_linked -- path to the file with questions and answers.

    Returns a dictionary mapping each question tuple to the list of its
    answer tuples.

    Raises KeyError when an answer line appears before any question line,
    and ValueError/SyntaxError when a line is not a Python literal.
    """
    import ast

    result = {}
    current = -1
    with open(input_filepath_linked, 'r', encoding='utf8') as input_file:
        for row in input_file:
            if not row.strip():
                # Tolerate blank lines instead of failing to parse them.
                continue
            # literal_eval only accepts Python literals, so a tampered
            # file cannot execute code the way eval() would allow.
            row_tuple = ast.literal_eval(row)
            if row_tuple[0] == '1':
                current = row_tuple[1:]
                result[current] = []
            else:
                result[current].append(row_tuple[1:])
    return result
if __name__ == "__main__":
    # filter_postgresql_questions('D:\stackoverflow.com\Posts.xml')
    # Load the previously saved questions together with their answers.
    questions_answers = retrieve_linked('D:\\final.txt')
......@@ -31,4 +31,49 @@ def test_dict_tuple_key() -> None:
print('No match')
print(result)
print(type(result))
\ No newline at end of file
print(type(result))
def wtha():
    """Debug helper: print the stored answer row whose 'id' is '4416'.

    Reads 'D:\\all_answers.txt' line by line; every line must be the
    textual form of a dictionary with an 'id' key.
    """
    import ast

    with open('D:\\all_answers.txt', 'r', encoding='utf8') as f:
        for row in f:
            # literal_eval parses the stored dict without executing
            # arbitrary expressions, unlike eval().
            row_tmp = ast.literal_eval(row)
            if row_tmp['id'] == '4416':
                print(row)
def testing():
    """Experiment: round-trip type-tagged tuples through a text file.

    Writes five sample tuples to a temporary file, one per line, then reads
    them back and groups every '2' tuple under the second element of the
    most recent '1' tuple. The resulting dictionary is printed.
    """
    import ast

    tmp_path = 'C:\\Users\\Hecko\\Desktop\\tmp.txt'
    sample_rows = [
        ('1', 'Hello1'),
        ('2', 'Hello2'),
        ('2', 'Hello3'),
        ('1', 'Hello4'),
        ('2', 'Hello5'),
    ]
    with open(tmp_path, 'w', encoding='utf8') as output_file:
        for sample in sample_rows:
            output_file.write(str(sample) + '\n')

    result = {}
    current = None
    with open(tmp_path, 'r', encoding='utf8') as input_file:
        for row in input_file:
            # Parse each line once (the data are plain literals, so
            # literal_eval is sufficient and safer than eval).
            parsed = ast.literal_eval(row)
            if parsed[0] == '1':
                current = parsed[1]
                result[current] = []
            else:
                result[current].append(parsed)
    print(result)
def testing2():
    """Experiment: replace a dict's keys with question tuples via zip.

    Prints the sample dictionary, then prints the dictionary obtained by
    pairing the question tuples with the sample's values in order (zip
    stops at the shorter input, so only one entry remains).
    """
    questions = [('1', 'hello', 'yush')]
    sample_pairs = [
        (('1', '2'), "any"),
        (('2', '3'), "any2"),
        (('3', '4'), "any3"),
        (('4', '5'), "any4"),
        (('5', '6'), "any5"),
    ]
    mydict = dict(sample_pairs)
    print(mydict)
    rekeyed = {}
    for question, value in zip(questions, mydict.values()):
        rekeyed[question] = value
    print(rekeyed)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment