Skip to content
Snippets Groups Projects
Commit 3a86d2a4 authored by Mesharo's avatar Mesharo
Browse files

cleanup, filepaths, comments

parent 4c1c282c
No related merge requests found
import re import re
from bigxml import Parser, xml_handle_element from bigxml import Parser, xml_handle_element
import time
from tests import * def get_postgresql_tags(input_filepath_tags_xml: str) -> list:
"""Return a list of wanted postgresql tags.
# wanna find postgresql and postgresql-version tags
def get_postgresql_tags(file_name: str) -> list: Argument:
input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
"""
result = [] result = []
file = open(file_name, 'r', encoding='utf8') file = open(input_filepath_tags_xml, 'r', encoding='utf8')
for row in file: for row in file:
id_tagname = re.search('<row Id="(.+?)" TagName="(.+?)"', row) id_tagname = re.search('<row Id="(.+?)" TagName="(.+?)"', row)
...@@ -38,33 +40,55 @@ def get_postgresql_tags(file_name: str) -> list: ...@@ -38,33 +40,55 @@ def get_postgresql_tags(file_name: str) -> list:
@xml_handle_element('posts', 'row')
def handler(node):
    """Handler for BigXML parser.

    Parse one <row> element of <posts> and yield a user-defined structure:
    a dict for a question (PostTypeId '1') or an answer (PostTypeId '2'),
    or the string 'N/A' when the row is of another type or is missing the
    attribute specific to its type ('Tags' for questions, 'ParentId' for
    answers).

    Argument:
    node -- element passed in by the parser; its ``attributes`` mapping is read.

    Raises KeyError when 'PostTypeId', 'Id' or 'Body' is missing.
    """
    attributes = node.attributes
    post_type_id = attributes['PostTypeId']
    # Named post_id so the builtin id() is not shadowed.
    post_id = attributes['Id']
    body = attributes['Body']

    # Only the attribute lookup is guarded; the yield stays outside the try
    # so that a KeyError thrown into the generator is never mistaken for a
    # missing attribute.
    if post_type_id == '1':
        # question
        try:
            tags = attributes['Tags']
        except KeyError:
            yield 'N/A'
            return
        yield {
            'post_type_id': post_type_id,
            'id': post_id,
            'body': body,
            'tags': tags,
        }
    elif post_type_id == '2':
        # answer
        try:
            parent_id = attributes['ParentId']
        except KeyError:
            yield 'N/A'
            return
        yield {
            'post_type_id': post_type_id,
            'id': post_id,
            'body': body,
            'parent_id': parent_id,
        }
    else:
        yield 'N/A'
def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool: def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> bool:
"""
Check for postgresql tags.
Return true when found.
Arguments:
postgresql_tags -- list of wanted postgresql tags ('postgresql', versions).
question_tags -- string of the current question's tags.
"""
correct = False correct = False
for tag in postgresql_tags: for tag in postgresql_tags:
if tag in question_tags: if tag in question_tags:
...@@ -72,23 +96,39 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> ...@@ -72,23 +96,39 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
break break
return correct return correct
# filter out wanted questions and save them in a separate file for quicker access def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
def filter_postgresql_questions(file_name: str) -> None: """Filter out postgresql questions.
postgresql_tags = get_postgresql_tags('D:\stackoverflow.com\Tags.xml')
output_file_questions = open('D:\postgresql_questions.txt', 'a', encoding='utf8') Read posts from a file by doses (1GB).
output_file_all_answers = open('D:\\all_answers.txt', 'a', encoding='utf8') Let parser and handler yield row values in expected structure.
Distinguish between questions and answers.
Check for postgresql tags, save if found.
Arguments:
input_filepath_posts_xml -- path to the Posts.xml from Stackoverflow's data dump.
input_filepath_tags_xml -- path to the Tags.xml from Stackoverflow's data dump.
output_filepath_questions -- path where we save filtered questions.
output_filepath_answers -- path where we save all answers.
"""
postgresql_tags = get_postgresql_tags(input_filepath_tags_xml)
output_file_questions = open(output_filepath_questions, 'a', encoding='utf8')
output_file_all_answers = open(output_filepath_answers, 'a', encoding='utf8')
with open(file_name, 'rb') as XML_file: with open(input_filepath_posts_xml, 'rb') as XML_file:
rows = XML_file.readlines(1000000000) rows = XML_file.readlines(1000000000)
# todo - encoding to utf8 for each readlines (insert header into list) while len(rows) > 2:
while len(rows) > 1: if rows[len(rows) - 1] != b'</posts>':
rows.append(b'</posts>') rows.append(b'</posts>')
for item in Parser(rows).iter_from(handler): for item in Parser(rows).iter_from(handler):
# answer # answer
if item['tags'] == 'N/A': if item == 'N/A':
continue
if item['post_type_id'] == '2':
output_file_all_answers.write(str(item) + '\n') output_file_all_answers.write(str(item) + '\n')
continue continue
...@@ -97,41 +137,81 @@ def filter_postgresql_questions(file_name: str) -> None: ...@@ -97,41 +137,81 @@ def filter_postgresql_questions(file_name: str) -> None:
output_file_questions.write(str(item) + '\n') output_file_questions.write(str(item) + '\n')
rows = XML_file.readlines(1000000000) rows = XML_file.readlines(1000000000)
rows.insert(0, b'<posts>') rows.insert(0, b'<?xml version="1.0" encoding="utf-8"?>')
rows.insert(1, b'<posts>')
output_file_questions.close() output_file_questions.close()
output_file_all_answers.close() output_file_all_answers.close()
def get_questions(input_filepath_questions) -> list:
    """Read postgresql questions into a list, represented as tuples.

    Each row of the file is expected to be the text of a question dict
    (keys 'post_type_id', 'id', 'body', 'tags' in that order). Rows that do
    not match are reported on stdout and skipped.

    Argument:
    input_filepath_questions -- path to the file with postgresql questions.

    Returns a list of tuples ('1', id, body); body is the captured text
    between "'body': " and ", 'tags'", so it still carries its quotes.
    """
    # Compiled once instead of being looked up for every row.
    question_pattern = re.compile(
        r"{'post_type_id': '1', 'id': '(.+?)', 'body': (.+?), 'tags'"
    )
    result = []
    # The with-block closes the file even if reading fails part-way.
    with open(input_filepath_questions, 'r', encoding='utf8') as input_file_questions:
        for row in input_file_questions:
            id_body = question_pattern.search(row)
            if not id_body:
                print('Failed to read question from file!')
                continue
            result.append(('1',) + id_body.group(1, 2))
    return result
def save_linked(output_filepath: str, linked_questions_answers: dict) -> None:
    """Write linked postgresql questions with their answers into a file.

    Every question is written on its own line, followed by one line per
    answer. The file is opened in append mode, so repeated calls add to it.

    Arguments:
    output_filepath -- path where we want to save everything.
    linked_questions_answers -- dictionary with questions as keys and lists of answers as values
    """
    # The with-block guarantees the handle is closed even if a write fails.
    with open(output_filepath, 'a', encoding='utf8') as output_file:
        for question, answers in linked_questions_answers.items():
            output_file.write(str(question) + '\n')
            for answer in answers:
                output_file.write(str(answer) + '\n')
def link_questions_with_answers(input_filepath_answers: str, questions: list) -> None:
"""Link postgresql questions with corresponding answers.
Sort postgresql question based on their ID.
Load answers, check their parent_id attribute.
Link postgresql answers to their questions.
Call function save_linked(result: dict) to save all into a file.
Arguments:
input_filepath_answers -- path to the file with answers.
questions -- list with postgresql questions, represented as tuples.
"""
input_file_answers = open(input_filepath_answers, 'r', encoding='utf8')
result = {} result = {}
questions.sort(key = lambda x: int(x[1]))
for tuple_key in questions: for tuple_key in questions:
result[tuple_key] = [] result[tuple_key[1]] = []
rows = input_file_answers.readlines(1000000000) rows = input_file_answers.readlines(1000000000)
while len(rows) > 0: while len(rows) > 0:
for answer in rows: for answer in rows:
id_body_parent = re.search("{'id': '(.+?)', 'body': '(.+?)', 'tags': 'N/A', 'parent_id': '(.+?)'", answer) id_body_parent = re.search("{'post_type_id': '2', 'id': '(.+?)', 'body': (.+?), 'parent_id': '(.+?)'", answer)
if not id_body_parent: if not id_body_parent:
print('Failed to read answer from file!') print('Failed to read answer from file!')
...@@ -139,20 +219,46 @@ def link_questions_with_answers(questions: list) -> dict: ...@@ -139,20 +219,46 @@ def link_questions_with_answers(questions: list) -> dict:
tuple_id_body_parent = id_body_parent.group(1, 2, 3) tuple_id_body_parent = id_body_parent.group(1, 2, 3)
for key in result: if result.get(tuple_id_body_parent[2]) is not None:
if key[0] == tuple_id_body_parent[2]: result[tuple_id_body_parent[2]].append(('2',) + tuple_id_body_parent)
result[key].append(tuple_id_body_parent)
break
rows = input_file_answers.readlines(1000000000) rows = input_file_answers.readlines(1000000000)
input_file_answers.close() input_file_answers.close()
result = dict(zip(questions, result.values()))
save_linked(result)
return result return result
def retrieve_linked(input_filepath_linked: str) -> dict:
    """Get questions and answers.

    Read saved postgresql questions with their corresponding
    answers from a file into a dictionary.

    Each non-blank row must be a tuple literal whose first element is the
    row type: '1' starts a new question, anything else is an answer that
    belongs to the most recent question. The type marker is dropped from
    both keys and values.

    Argument:
    input_filepath_linked -- path to the file with questions and answers.

    Returns a dict mapping question tuples to lists of answer tuples.
    Raises KeyError when an answer row appears before any question row.
    """
    # Local import so the block does not depend on the module import list.
    import ast

    result = {}
    current = None
    with open(input_filepath_linked, 'r', encoding='utf8') as input_file:
        for row in input_file:
            if not row.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # literal_eval only accepts Python literals, so a tampered
            # file cannot execute code the way eval() would.
            row_tuple = ast.literal_eval(row.strip())
            if row_tuple[0] == '1':
                current = row_tuple[1:]
                result[current] = []
            else:
                if current is None:
                    raise KeyError('answer row encountered before any question row')
                result[current].append(row_tuple[1:])
    return result
if __name__ == "__main__": if __name__ == "__main__":
# filter_postgresql_questions('D:\stackoverflow.com\Posts.xml') questions_answers = retrieve_linked('D:\\final.txt')
questions = get_questions() \ No newline at end of file
# questions_answers = link_questions_with_answers(get_questions)
\ No newline at end of file
...@@ -31,4 +31,49 @@ def test_dict_tuple_key() -> None: ...@@ -31,4 +31,49 @@ def test_dict_tuple_key() -> None:
print('No match') print('No match')
print(result) print(result)
print(type(result)) print(type(result))
\ No newline at end of file
def wtha(answers_filepath='D:\\all_answers.txt', wanted_id='4416'):
    """Print every saved answer row whose 'id' equals the wanted id.

    Debugging helper: each non-blank row of the file is parsed as a dict
    literal and the raw row is printed when its 'id' matches.

    Arguments:
    answers_filepath -- path to the file with answers (one dict literal per row).
    wanted_id -- value of the 'id' key to look for, as a string.
    """
    # Local import so the block does not depend on the module import list.
    import ast

    with open(answers_filepath, 'r', encoding='utf8') as answers_file:
        for row in answers_file:
            if not row.strip():
                continue
            # literal_eval parses the dict without executing arbitrary code.
            answer = ast.literal_eval(row.strip())
            if answer['id'] == wanted_id:
                print(row)
def testing(tmp_filepath=None):
    """Experiment: write typed tuples to a file and group them back on read.

    Rows whose first element is '1' open a new group keyed by their second
    element; all other rows are appended (as whole tuples) to the current
    group. The resulting dictionary is printed.

    Argument:
    tmp_filepath -- scratch file to use; when None a temporary file is
    created and removed afterwards, so the experiment runs on any machine.
    """
    # Local imports so the block does not depend on the module import list.
    import ast
    import os
    import tempfile

    sample_rows = [
        ('1', 'Hello1'),
        ('2', 'Hello2'),
        ('2', 'Hello3'),
        ('1', 'Hello4'),
        ('2', 'Hello5'),
    ]

    own_file = tmp_filepath is None
    if own_file:
        descriptor, tmp_filepath = tempfile.mkstemp(suffix='.txt')
        os.close(descriptor)

    try:
        with open(tmp_filepath, 'w', encoding='utf8') as output_file:
            for sample in sample_rows:
                output_file.write(str(sample) + '\n')

        result = {}
        current = None
        with open(tmp_filepath, 'r', encoding='utf8') as input_file:
            for row in input_file:
                # Parse each row once, and without eval().
                parsed = ast.literal_eval(row.strip())
                if parsed[0] == '1':
                    current = parsed[1]
                    result[current] = []
                else:
                    result[current].append(parsed)
    finally:
        if own_file:
            os.remove(tmp_filepath)

    print(result)
def testing2():
    """Experiment: re-key a dictionary's values with a list of tuples.

    Prints the original dictionary, then the dictionary obtained by pairing
    each question tuple with the dictionary's values in order; surplus
    values are dropped because zip stops at the shorter input.
    """
    question_keys = [('1', 'hello', 'yush')]
    original = {
        ('1', '2'): "any",
        ('2', '3'): "any2",
        ('3', '4'): "any3",
        ('4', '5'): "any4",
        ('5', '6'): "any5",
    }
    print(original)

    rekeyed = {}
    for question, value in zip(question_keys, original.values()):
        rekeyed[question] = value
    print(rekeyed)
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment