Skip to content
Snippets Groups Projects
Commit 93dd4560 authored by Mesharo's avatar Mesharo
Browse files

dataset, kapitola

parent 75dab8ba
No related merge requests found
No preview for this file type
......@@ -89,12 +89,24 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
question_tags -- string of the current question's tags.
"""
correct = False
for tag in postgresql_tags:
if tag in question_tags:
correct = True
question_tags_split = re.findall('<(.+?)>', question_tags)
found_sql_tag = False
for question_tag in question_tags_split:
if question_tag == 'sql':
found_sql_tag = True
break
return correct
if not found_sql_tag:
return False
question_tags_split.remove('sql')
for question_tag in question_tags_split:
for postgresql_tag in postgresql_tags:
if question_tag == postgresql_tag:
return True
return False
def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
"""Filter out postgresql questions.
......@@ -326,4 +338,11 @@ def load_code_sections(input_filepath_codes: str) -> dict:
tmp = eval(line.rstrip())
result[tmp[0]] = tmp[1]
return result
\ No newline at end of file
return result
# Script entry point: intentionally a no-op (`pass`). The commented-out calls
# below are kept as examples of manual one-off runs; the hard-coded paths point
# at XML files under D:\stackoverflow.com on the author's machine.
if __name__ == '__main__':
#filter_postgresql_questions('D:\\stackoverflow.com\\Posts.xml', 'D:\\stackoverflow.com\\Tags.xml', 'D:\\postgresql_questions.txt', 'D:\\all_answers.txt')
#get_postgresql_tags('D:\\stackoverflow.com\\Tags.xml')
pass
\ No newline at end of file
import sqlglot.dialects
import sqlglot.dialects.dialect
import sqlglot.errors
import sqlglot.optimizer
import sqlglot.optimizer.qualify
from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections
import re
import os.path
import sqlglot
import copy
import config
from io import StringIO
......@@ -98,6 +98,7 @@ def analyze(id: str, codes: list) -> tuple:
parsed = 0
not_parsed = 0
is_none = 0
not_empty_columns_tables_aliases = 0
for code_list in codes:
all_codes_string = erase_html(code_list)
for code in all_codes_string.split(';'):
......@@ -109,6 +110,24 @@ def analyze(id: str, codes: list) -> tuple:
expression_tree = sqlglot.parse(code, dialect='postgres')
if expression_tree == [None]:
is_none += 1
correct = True
continue
for ast in expression_tree:
columns = ast.find_all(sqlglot.exp.Column)
tables = ast.find_all(sqlglot.exp.Table)
aliases = ast.find_all(sqlglot.exp.Alias)
if next(columns, None) is not None:
not_empty_columns_tables_aliases += 1
continue
if next(tables, None) is not None:
not_empty_columns_tables_aliases += 1
continue
if next(aliases, None) is not None:
not_empty_columns_tables_aliases += 1
continue
correct = True
except sqlglot.errors.ParseError as pe:
correct = False
......@@ -116,6 +135,9 @@ def analyze(id: str, codes: list) -> tuple:
except sqlglot.errors.TokenError as te:
correct = False
#print(f'----\nTokenError: {te}\n-----')
except sqlglot.errors.OptimizeError as oe:
correct = False
#print(f'----\nOptimizeError: {oe}\n-----')
except:
correct = False
......@@ -124,7 +146,7 @@ def analyze(id: str, codes: list) -> tuple:
else:
not_parsed += 1
return (parsed, not_parsed, is_none)
return (parsed, not_parsed, is_none, not_empty_columns_tables_aliases)
def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None:
"""Main function.
......@@ -145,6 +167,7 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
parsed = 0
not_parsed = 0
is_none = 0
not_empty_columns_tables_aliases = 0
for key, values in codes.items():
if not values:
......@@ -154,18 +177,12 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
parsed += tmp[0]
not_parsed += tmp[1]
is_none += tmp[2]
not_empty_columns_tables_aliases += tmp[3]
print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed)')
print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed), Found col/table/alias: {not_empty_columns_tables_aliases}')
print('DONE!')
# Parsed: 931, not parsed: 1802 (originally)
# Parsed: 1122, not parsed: 2251 (ignoring empty ones)
# Parsed: 5189, not parsed: 3887 (split into sections, without the postgres dialect)
# Parsed: 9723, not parsed: 6415 (split ;)
# Parsed: 9769, not parsed: 6369 (E' -> ', no `)
# Parsed: 9571, not parsed: 6567 (added dialect='postgres')
# Parsed: 1192213, not parsed: 886185 (all)
# Parsed: 1192213, not parsed: 886185, None: 198696 (included in Parsed)
# new dataset: Parsed: 424400, not parsed: 201498, None: 80004 (included in Parsed)
# Parsed: 344396, not parsed: 201498, None: 80004 (not included in Parsed), Found col/table/alias: 280418
return
if os.path.isfile(input_filepath_linked):
......@@ -187,12 +204,4 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
# Script entry point: call run() with the four input paths looked up in
# config.filepaths (all answers, PostgreSQL questions, linked data, codes).
if __name__ == "__main__":
run(config.filepaths['all_answers'], config.filepaths['postgresql_questions'], config.filepaths['linked'], config.filepaths['codes'])
"""
DONE:
1. Fix analyze
2. Fix kapitoly
- PL/pgSQL zahazuju (DECLARE @OuterPageSize int)
"""
\ No newline at end of file
\ No newline at end of file
......@@ -78,6 +78,7 @@ def testing2():
print(mydict)
import sqlglot.optimizer.scope
import sqlparse
def erasing_backslashes():
xd = """SELECT \\\'ALTER TABLE "\\\'||nspname||\\\'"."\\\'||relname||\\\'" DROP CONSTRAINT "\\\'||conname||\\\'";\\\'
......@@ -118,6 +119,7 @@ def sqlmetadata():
import sqlglot
import sqlglot.optimizer
import sqlglot.optimizer.qualify
from sqlglot import exp
def sqlglottry():
for column in sqlglot.parse_one("""
insert into EscapeTest (text) values (E'This is the first part
......@@ -169,17 +171,65 @@ def tmp():
expression_tree = undo
def whatever():
    """Scratch experiment: parse sample SQL with sqlglot and print the findings.

    Part one splits a statement on ';', parses every fragment and prints the
    repr of the result, flagging fragments whose parse result is ``[None]``.
    Part two parses an aliased query and prints the names of all Column,
    Table and Alias nodes found in each resulting tree.

    ``sqlglot.errors.ParseError`` is printed instead of being raised.
    """
    expr = "SELECT hello FROM y;"
    # The loop variable must not be named `exp`: binding that name anywhere in
    # this function would make it a local string and hide the imported
    # `sqlglot.exp` module, breaking the `exp.Column` lookups further down.
    for fragment in expr.split(';'):
        try:
            expression_tree = sqlglot.parse(fragment)
            print(repr(expression_tree))
            if expression_tree == [None]:
                print('is None')
        except sqlglot.errors.ParseError as pe:
            print(f'ParseError: {pe}')

    expr = "SELECT xd.hello FROM y AS xd"
    try:
        expression_tree = sqlglot.parse(expr)
        print(repr(expression_tree))
        for ast in expression_tree:
            if ast is None:
                # parse() can return None entries (compare the `[None]` check
                # above); there is nothing to search in those.
                continue
            # find_all() returns lazy generators, so nothing is walked until
            # the loops below consume them.
            columns = ast.find_all(exp.Column)
            tables = ast.find_all(exp.Table)
            aliases = ast.find_all(exp.Alias)
            print('Columns!')
            for column in columns:
                print(column.name)
            print('Tables!')
            for table in tables:
                print(table.name)
            print('Aliases!')
            for alias in aliases:
                print(alias.name)
        # NOTE: an earlier, disabled variant iterated
        # expression_tree.args['expressions'] by hand and printed
        # node.args['this'] / ['table'] / ['alias'] for Column, Alias and
        # Table nodes. It was inert (a bare string literal) and is recorded
        # here only as a reminder of that approach.
    except sqlglot.errors.ParseError as pe:
        print(f'ParseError: {pe}')
def test_qualify():
    """Scratch experiment: build and print sqlglot scope trees for an INSERT.

    Parses a statement that contains a nested SELECT, builds a scope tree for
    every parsed expression with ``sqlglot.optimizer.scope.build_scope``,
    prints the root scope and then every scope yielded by ``traverse()``.

    ``sqlglot.errors.OptimizeError`` is printed instead of being raised.
    """
    statement = "INSERT INTO first VALUES((SELECT hello FROM second), world);"
    try:
        trees = sqlglot.parse(statement)
        print(repr(trees))
        for tree in trees:
            if tree is None:
                # parse() may return None entries; no scope can be built.
                continue
            root = sqlglot.optimizer.scope.build_scope(tree)
            print(root)
            if root is None:
                # build_scope() is typed Optional[Scope]; without this guard
                # root.traverse() would raise AttributeError on None.
                continue
            for scope in root.traverse():
                print(scope)
    except sqlglot.errors.OptimizeError as oe:
        print(f'----\nOptimizeError: {oe}\n-----')
# Script entry point for the scratch experiments defined in this file.
if __name__ == '__main__':
whatever()
\ No newline at end of file
#whatever()
test_qualify()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment