Skip to content
Snippets Groups Projects
Commit 93dd4560 authored by Mesharo's avatar Mesharo
Browse files

dataset, kapitola

parent 75dab8ba
No related merge requests found
No preview for this file type
...@@ -89,12 +89,24 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) -> ...@@ -89,12 +89,24 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
question_tags -- string of the current question's tags. question_tags -- string of the current question's tags.
""" """
correct = False question_tags_split = re.findall('<(.+?)>', question_tags)
for tag in postgresql_tags:
if tag in question_tags: found_sql_tag = False
correct = True for question_tag in question_tags_split:
if question_tag == 'sql':
found_sql_tag = True
break break
return correct
if not found_sql_tag:
return False
question_tags_split.remove('sql')
for question_tag in question_tags_split:
for postgresql_tag in postgresql_tags:
if question_tag == postgresql_tag:
return True
return False
def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None: def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
"""Filter out postgresql questions. """Filter out postgresql questions.
...@@ -326,4 +338,11 @@ def load_code_sections(input_filepath_codes: str) -> dict: ...@@ -326,4 +338,11 @@ def load_code_sections(input_filepath_codes: str) -> dict:
tmp = eval(line.rstrip()) tmp = eval(line.rstrip())
result[tmp[0]] = tmp[1] result[tmp[0]] = tmp[1]
return result return result
\ No newline at end of file
if __name__ == '__main__':
#filter_postgresql_questions('D:\\stackoverflow.com\\Posts.xml', 'D:\\stackoverflow.com\\Tags.xml', 'D:\\postgresql_questions.txt', 'D:\\all_answers.txt')
#get_postgresql_tags('D:\\stackoverflow.com\\Tags.xml')
pass
\ No newline at end of file
import sqlglot.dialects import sqlglot.dialects
import sqlglot.dialects.dialect import sqlglot.dialects.dialect
import sqlglot.errors
import sqlglot.optimizer import sqlglot.optimizer
import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify
from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections
import re import re
import os.path import os.path
import sqlglot import sqlglot
import copy
import config import config
from io import StringIO from io import StringIO
...@@ -98,6 +98,7 @@ def analyze(id: str, codes: list) -> tuple: ...@@ -98,6 +98,7 @@ def analyze(id: str, codes: list) -> tuple:
parsed = 0 parsed = 0
not_parsed = 0 not_parsed = 0
is_none = 0 is_none = 0
not_empty_columns_tables_aliases = 0
for code_list in codes: for code_list in codes:
all_codes_string = erase_html(code_list) all_codes_string = erase_html(code_list)
for code in all_codes_string.split(';'): for code in all_codes_string.split(';'):
...@@ -109,6 +110,24 @@ def analyze(id: str, codes: list) -> tuple: ...@@ -109,6 +110,24 @@ def analyze(id: str, codes: list) -> tuple:
expression_tree = sqlglot.parse(code, dialect='postgres') expression_tree = sqlglot.parse(code, dialect='postgres')
if expression_tree == [None]: if expression_tree == [None]:
is_none += 1 is_none += 1
correct = True
continue
for ast in expression_tree:
columns = ast.find_all(sqlglot.exp.Column)
tables = ast.find_all(sqlglot.exp.Table)
aliases = ast.find_all(sqlglot.exp.Alias)
if next(columns, None) is not None:
not_empty_columns_tables_aliases += 1
continue
if next(tables, None) is not None:
not_empty_columns_tables_aliases += 1
continue
if next(aliases, None) is not None:
not_empty_columns_tables_aliases += 1
continue
correct = True correct = True
except sqlglot.errors.ParseError as pe: except sqlglot.errors.ParseError as pe:
correct = False correct = False
...@@ -116,6 +135,9 @@ def analyze(id: str, codes: list) -> tuple: ...@@ -116,6 +135,9 @@ def analyze(id: str, codes: list) -> tuple:
except sqlglot.errors.TokenError as te: except sqlglot.errors.TokenError as te:
correct = False correct = False
#print(f'----\nTokenError: {te}\n-----') #print(f'----\nTokenError: {te}\n-----')
except sqlglot.errors.OptimizeError as oe:
correct = False
#print(f'----\nOptimizeError: {oe}\n-----')
except: except:
correct = False correct = False
...@@ -124,7 +146,7 @@ def analyze(id: str, codes: list) -> tuple: ...@@ -124,7 +146,7 @@ def analyze(id: str, codes: list) -> tuple:
else: else:
not_parsed += 1 not_parsed += 1
return (parsed, not_parsed, is_none) return (parsed, not_parsed, is_none, not_empty_columns_tables_aliases)
def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None: def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None:
"""Main function. """Main function.
...@@ -145,6 +167,7 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st ...@@ -145,6 +167,7 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
parsed = 0 parsed = 0
not_parsed = 0 not_parsed = 0
is_none = 0 is_none = 0
not_empty_columns_tables_aliases = 0
for key, values in codes.items(): for key, values in codes.items():
if not values: if not values:
...@@ -154,18 +177,12 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st ...@@ -154,18 +177,12 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
parsed += tmp[0] parsed += tmp[0]
not_parsed += tmp[1] not_parsed += tmp[1]
is_none += tmp[2] is_none += tmp[2]
not_empty_columns_tables_aliases += tmp[3]
print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed)') print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed), Found col/table/alias: {not_empty_columns_tables_aliases}')
print('DONE!') print('DONE!')
# Parsed: 931, not parsed: 1802 (původně) # new dataset: Parsed: 424400, not parsed: 201498, None: 80004 (included in Parsed)
# Parsed: 1122, not parsed: 2251 (ignor prázdných) # Parsed: 344396, not parsed: 201498, None: 80004 (not included in Parsed), Found col/table/alias: 280418
# Parsed: 5189, not parsed: 3887 (split na sekce, bez dialektu postgres)
# Parsed: 9723, not parsed: 6415 (split ;)
# Parsed: 9769, not parsed: 6369 (E' -> ', no `)
# Parsed: 9571, not parsed: 6567 (added dialect='postgres')
# Parsed: 1192213, not parsed: 886185 (all)
# Parsed: 1192213, not parsed: 886185, None: 198696 (included in Parsed)
return return
if os.path.isfile(input_filepath_linked): if os.path.isfile(input_filepath_linked):
...@@ -187,12 +204,4 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st ...@@ -187,12 +204,4 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
if __name__ == "__main__": if __name__ == "__main__":
run(config.filepaths['all_answers'], config.filepaths['postgresql_questions'], config.filepaths['linked'], config.filepaths['codes']) run(config.filepaths['all_answers'], config.filepaths['postgresql_questions'], config.filepaths['linked'], config.filepaths['codes'])
\ No newline at end of file
"""
DONE:
1. Fix analyze
2. Fix kapitoly
- PL/pgSQL zahazuju (DECLARE @OuterPageSize int)
"""
\ No newline at end of file
...@@ -78,6 +78,7 @@ def testing2(): ...@@ -78,6 +78,7 @@ def testing2():
print(mydict) print(mydict)
import sqlglot.optimizer.scope
import sqlparse import sqlparse
def erasing_backslashes(): def erasing_backslashes():
xd = """SELECT \\\'ALTER TABLE "\\\'||nspname||\\\'"."\\\'||relname||\\\'" DROP CONSTRAINT "\\\'||conname||\\\'";\\\' xd = """SELECT \\\'ALTER TABLE "\\\'||nspname||\\\'"."\\\'||relname||\\\'" DROP CONSTRAINT "\\\'||conname||\\\'";\\\'
...@@ -118,6 +119,7 @@ def sqlmetadata(): ...@@ -118,6 +119,7 @@ def sqlmetadata():
import sqlglot import sqlglot
import sqlglot.optimizer import sqlglot.optimizer
import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify
from sqlglot import exp
def sqlglottry(): def sqlglottry():
for column in sqlglot.parse_one(""" for column in sqlglot.parse_one("""
insert into EscapeTest (text) values (E'This is the first part insert into EscapeTest (text) values (E'This is the first part
...@@ -169,17 +171,65 @@ def tmp(): ...@@ -169,17 +171,65 @@ def tmp():
expression_tree = undo expression_tree = undo
def whatever(): def whatever():
expr = "SELECT hello FROM y;" expr = "SELECT xd.hello FROM y AS xd"
expr_split = expr.split(';') try:
for exp in expr_split: expression_tree = sqlglot.parse(expr)
try: print(repr(expression_tree))
expression_tree = sqlglot.parse(exp)
print(repr(expression_tree)) for tmp in expression_tree:
if expression_tree == [None]: columns = tmp.find_all(exp.Column)
print('is None') tables = tmp.find_all(exp.Table)
aliases = tmp.find_all(exp.Alias)
except sqlglot.errors.ParseError as pe:
print(f'ParseError: {pe}') print('Columns!')
for column in columns:
print(column.name)
print('Tables!')
for table in tables:
print(table.name)
print('Aliases!')
for alias in aliases:
print(alias.name)
"""
for node in expression_tree.args['expressions']:
if isinstance(node, exp.Column):
if (node.args['this']):
print(f'Column name: {node.args["this"]}')
if (node.args['table']):
print(f'from table: {node.args["table"]}')
if isinstance(node, exp.Alias):
if (node.args['this']):
print(f'Column name: {node.args["this"]}')
if isinstance(node, exp.Table):
if (node.args['this']):
print(f'Column name: {node.args["this"]}')
if (node.args['alias']):
print(f'from table: {node.args["alias"]}')
"""
except sqlglot.errors.ParseError as pe:
print(f'ParseError: {pe}')
def test_qualify():
    """Print the scope tree sqlglot builds for a sample INSERT statement.

    Parses a hard-coded statement that contains a scalar subquery, prints the
    list of parsed trees, then for every tree builds its scope tree with
    ``sqlglot.optimizer.scope.build_scope`` and prints the root scope followed
    by every scope yielded by ``root.traverse()``.

    An ``OptimizeError`` raised while building scopes is printed instead of
    being propagated.
    """
    statement = "INSERT INTO first VALUES((SELECT hello FROM second), world);"
    try:
        trees = sqlglot.parse(statement)
        print(repr(trees))

        for tree in trees:
            # sqlglot.parse may yield None for an empty statement (e.g. the
            # part after a trailing ';'); there is nothing to scope there.
            if tree is None:
                continue
            root = sqlglot.optimizer.scope.build_scope(tree)
            print(root)
            # build_scope returns None when the expression has no traversable
            # scope; calling .traverse() on it would raise AttributeError.
            if root is None:
                continue
            for scope in root.traverse():
                print(scope)
    except sqlglot.errors.OptimizeError as oe:
        print(f'----\nOptimizeError: {oe}\n-----')
if __name__ == '__main__': if __name__ == '__main__':
whatever() #whatever()
\ No newline at end of file test_qualify()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment