Skip to content
Snippets Groups Projects
Commit 75dab8ba authored by Mesharo's avatar Mesharo
Browse files

fix analyze, fix kapitoly

parent a589ae26
No related merge requests found
No preview for this file type
import sqlglot.dialects
import sqlglot.dialects.dialect
import sqlglot.optimizer
import sqlglot.optimizer.qualify
from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections
......@@ -26,8 +28,54 @@ def strip_tags(html):
s.feed(html)
return s.get_data()
def erase_html(code_section: str) -> str:
result = strip_tags(' '.join(code_section))
def replace_unrecognized_letters(result: str) -> str:
result = result.replace('E\'', '\'')
result = result.replace('`', '')
result = result.replace('ł', 'l')
result = result.replace('ø', 'o')
if 'psql:latest.dump:1601: invalid command' in result:
return ''
result = result.replace('£', '$')
result = result.replace('è', 'e').replace('é', 'e').replace('É', 'e')
result = result.replace('í', 'i')
result = result.replace('ç', 'c')
result = result.replace('õ', 'o')
if '<?php if($funcao==1 )' in result:
return ''
result = result.replace('á', 'a')
result = result.replace('ó', 'o')
result = result.replace('ń', 'n')
result = result.replace('ñ', 'n')
if 'else { $GLOBALS[komunikat_edycja_agenta' in result:
return ''
result = result.replace('ã', 'a')
result = result.replace('©', 'c')
result = result.replace('ü', 'u')
result = result.replace('×', 'x')
result = result.replace('ú', 'u')
result = result.replace('ê', 'e')
result = result.replace('«', '').replace('»', '')
result = result.replace('à', 'a')
result = result.replace('ф', '')
result = result.replace('ô', 'o')
result = result.replace('ä', 'a')
result = result.replace('§', '$')
result = result.replace('Ú', 'U')
result = result.replace('ö', 'o')
result = result.replace('â', 'a')
result = result.replace('º', '')
result = result.replace('¡', '')
result = result.replace('ȼ', '')
result = result.replace('ß', 's')
result = result.replace('с', 'c')
result = result.replace('С', 'C')
result = result.replace('Â', 'A')
result = result.replace('÷', '').replace('®', '').replace('¤', '').replace('¦', '')
return result
def erase_html(code_section: list) -> str:
result = strip_tags(';'.join(code_section))
pattern = r"\\+['a-zA-Z]"
matches = re.findall(pattern, result)
......@@ -43,34 +91,40 @@ def erase_html(code_section: str) -> str:
result = result.replace(match, '\n')
else:
result = result.replace(match, match[-2:])
return result
return replace_unrecognized_letters(result)
def analyze(id: str, codes: list) -> tuple:
parsed = 0
not_parsed = 0
for code in codes:
code = erase_html(code)
correct = False
try:
expression_tree = sqlglot.parse(code, dialect='postgres')
correct = True
except sqlglot.errors.ParseError:
correct = False
except sqlglot.errors.TokenError:
correct = False
except:
correct = False
if correct:
parsed += 1
else:
not_parsed += 1
return (parsed, not_parsed)
is_none = 0
for code_list in codes:
all_codes_string = erase_html(code_list)
for code in all_codes_string.split(';'):
#print('----------------------')
#print(code)
correct = False
try:
expression_tree = sqlglot.parse(code, dialect='postgres')
if expression_tree == [None]:
is_none += 1
correct = True
except sqlglot.errors.ParseError as pe:
correct = False
#print(f'----\nParseError: {pe}\n-----')
except sqlglot.errors.TokenError as te:
correct = False
#print(f'----\nTokenError: {te}\n-----')
except:
correct = False
if correct:
parsed += 1
else:
not_parsed += 1
return (parsed, not_parsed, is_none)
def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None:
"""Main function.
......@@ -88,17 +142,30 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
if os.path.isfile(input_filepath_codes):
codes = load_code_sections(input_filepath_codes)
count = 0
parsed = 0
not_parsed = 0
is_none = 0
for key, values in codes.items():
if count < 1000:
tmp = analyze(key, values)
parsed += tmp[0]
not_parsed += tmp[1]
count += 1
print(f'Parsed: {parsed}, not parsed: {not_parsed}')
# Parsed: 931, not parsed: 1802
if not values:
continue
tmp = analyze(key, values)
parsed += tmp[0]
not_parsed += tmp[1]
is_none += tmp[2]
print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed)')
print('DONE!')
# Parsed: 931, not parsed: 1802 (původně)
# Parsed: 1122, not parsed: 2251 (ignor prázdných)
# Parsed: 5189, not parsed: 3887 (split na sekce, bez dialektu postgres)
# Parsed: 9723, not parsed: 6415 (split ;)
# Parsed: 9769, not parsed: 6369 (E' -> ', no `)
# Parsed: 9571, not parsed: 6567 (added dialect='postgres')
# Parsed: 1192213, not parsed: 886185 (all)
# Parsed: 1192213, not parsed: 886185, None: 198696 (included in Parsed)
return
if os.path.isfile(input_filepath_linked):
......@@ -116,7 +183,6 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
link_questions_with_answers(input_filepath_all_answers, get_questions(input_filepath_postgresql_questions), input_filepath_linked)
#creates input_filepath_codes
save_code_sections(input_filepath_linked, input_filepath_codes)
run(input_filepath_all_answers, input_filepath_postgresql_questions, input_filepath_linked, input_filepath_codes)
if __name__ == "__main__":
......@@ -125,9 +191,8 @@ if __name__ == "__main__":
"""
DONE:
1. config file, soubory do files/
2. struktura kapitoly, nová kapitola ohledně schémat
3. test na větším počtu
"""
1. Fix analyze
2. Fix kapitoly
#TODO Fix analyze()
\ No newline at end of file
- PL/pgSQL zahazuju (DECLARE @OuterPageSize int)
"""
\ No newline at end of file
......@@ -168,8 +168,18 @@ def tmp():
except sqlglot.errors.OptimizeError:
expression_tree = undo
if __name__ == '__main__':
#print('SQLMETADATA: ')
#sqlmetadata()
def whatever():
expr = "SELECT hello FROM y;"
expr_split = expr.split(';')
for exp in expr_split:
try:
expression_tree = sqlglot.parse(exp)
print(repr(expression_tree))
if expression_tree == [None]:
print('is None')
except sqlglot.errors.ParseError as pe:
print(f'ParseError: {pe}')
tmp()
\ No newline at end of file
if __name__ == '__main__':
whatever()
\ No newline at end of file
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment