Skip to content
Snippets Groups Projects
Commit 75dab8ba authored by Mesharo's avatar Mesharo
Browse files

fix analyze, fix kapitoly

parent a589ae26
No related merge requests found
No preview for this file type
import sqlglot.dialects
import sqlglot.dialects.dialect
import sqlglot.optimizer import sqlglot.optimizer
import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify
from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections
...@@ -26,8 +28,54 @@ def strip_tags(html): ...@@ -26,8 +28,54 @@ def strip_tags(html):
s.feed(html) s.feed(html)
return s.get_data() return s.get_data()
# Matches the PostgreSQL escape-string prefix (E'...') only when the E is not
# the tail of a word, so a literal such as 'NAME' keeps its final letter.
_ESCAPE_STRING_PREFIX = re.compile(r"(?<!\w)E'")

# Character tables, one per stage.  The stages are separated by the junk-marker
# checks in replace_unrecognized_letters(); keeping them apart means each check
# sees the text transliterated exactly as far as the original chained
# .replace() calls had got at that point.
_STAGE_1 = str.maketrans({'`': '', 'ł': 'l', 'ø': 'o'})
_STAGE_2 = str.maketrans({
    '£': '$',
    'è': 'e', 'é': 'e', 'É': 'E',
    'í': 'i',
    'ç': 'c',
    'õ': 'o',
})
_STAGE_3 = str.maketrans({'á': 'a', 'ó': 'o', 'ń': 'n', 'ñ': 'n'})
_STAGE_4 = str.maketrans({
    'ã': 'a', 'à': 'a', 'ä': 'a', 'â': 'a', 'Â': 'A',
    'ê': 'e',
    'ô': 'o', 'ö': 'o',
    'ü': 'u', 'ú': 'u', 'Ú': 'U',
    'ß': 's',
    '©': 'c',
    '×': 'x',
    '§': '$',
    '\u0441': 'c',   # Cyrillic small es, looks like Latin c
    '\u0421': 'C',   # Cyrillic capital es, looks like Latin C
    '\u0444': '',    # Cyrillic small ef, dropped
    '«': '', '»': '',
    'º': '', '¡': '', 'ȼ': '',
    '÷': '', '®': '', '¤': '', '¦': '',
})

# Fragments that identify a code section as non-SQL junk; a section containing
# one of them is discarded entirely (empty string returned).
_DUMP_MARKER = 'psql:latest.dump:1601: invalid command'
_PHP_MARKER = '<?php if($funcao==1 )'
_GLOBALS_MARKER = 'else { $GLOBALS[komunikat_edycja_agenta'


def replace_unrecognized_letters(result: str) -> str:
    """Transliterate or drop characters before the text is handed to the parser.

    The E prefix of escape strings (``E'...'``) and backticks are removed,
    accented / look-alike letters are mapped to plain ASCII, and a handful of
    symbols are deleted.  If the text contains one of three known junk
    fragments (a psql dump error line or two PHP snippets) the whole text is
    rejected.

    Args:
        result: Text of a code section with HTML already stripped.

    Returns:
        The cleaned text, or ``''`` when a junk fragment was found.
    """
    result = _ESCAPE_STRING_PREFIX.sub("'", result)
    result = result.translate(_STAGE_1)
    if _DUMP_MARKER in result:
        return ''

    result = result.translate(_STAGE_2)
    if _PHP_MARKER in result:
        return ''

    result = result.translate(_STAGE_3)
    if _GLOBALS_MARKER in result:
        return ''

    return result.translate(_STAGE_4)
def erase_html(code_section: list) -> str:
result = strip_tags(';'.join(code_section))
pattern = r"\\+['a-zA-Z]" pattern = r"\\+['a-zA-Z]"
matches = re.findall(pattern, result) matches = re.findall(pattern, result)
...@@ -43,34 +91,40 @@ def erase_html(code_section: str) -> str: ...@@ -43,34 +91,40 @@ def erase_html(code_section: str) -> str:
result = result.replace(match, '\n') result = result.replace(match, '\n')
else: else:
result = result.replace(match, match[-2:]) result = result.replace(match, match[-2:])
return result
return replace_unrecognized_letters(result)
def analyze(id: str, codes: list) -> tuple:
    """Count how many SQL statements of one entry sqlglot is able to parse.

    Every element of ``codes`` is cleaned with ``erase_html``, the cleaned
    text is split on ``';'`` and each piece is parsed on its own with the
    ``postgres`` dialect.

    Args:
        id: Identifier of the entry being analysed.  Not used in the body;
            kept so existing callers keep working.
        codes: Code sections of the entry; each element is passed to
            ``erase_html`` as-is.

    Returns:
        A tuple ``(parsed, not_parsed, is_none)``.  ``is_none`` counts the
        pieces for which ``sqlglot.parse`` returned ``[None]``; those pieces
        are also included in ``parsed``.
    """
    parsed = 0
    not_parsed = 0
    is_none = 0

    for code_list in codes:
        all_codes_string = erase_html(code_list)
        for code in all_codes_string.split(';'):
            try:
                expression_tree = sqlglot.parse(code, dialect='postgres')
            except Exception:
                # Best effort: any failure (ParseError, TokenError or anything
                # else sqlglot raises) counts as "not parsed".  Exception, not
                # a bare except, so Ctrl-C / SystemExit still stop a long run.
                not_parsed += 1
                continue

            parsed += 1
            if expression_tree == [None]:
                is_none += 1

    return (parsed, not_parsed, is_none)
def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None: def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None:
"""Main function. """Main function.
...@@ -88,17 +142,30 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st ...@@ -88,17 +142,30 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
if os.path.isfile(input_filepath_codes): if os.path.isfile(input_filepath_codes):
codes = load_code_sections(input_filepath_codes) codes = load_code_sections(input_filepath_codes)
count = 0
parsed = 0 parsed = 0
not_parsed = 0 not_parsed = 0
is_none = 0
for key, values in codes.items(): for key, values in codes.items():
if count < 1000: if not values:
tmp = analyze(key, values) continue
parsed += tmp[0]
not_parsed += tmp[1] tmp = analyze(key, values)
count += 1 parsed += tmp[0]
print(f'Parsed: {parsed}, not parsed: {not_parsed}') not_parsed += tmp[1]
# Parsed: 931, not parsed: 1802 is_none += tmp[2]
print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed)')
print('DONE!')
# Parsed: 931, not parsed: 1802 (původně)
# Parsed: 1122, not parsed: 2251 (ignor prázdných)
# Parsed: 5189, not parsed: 3887 (split na sekce, bez dialektu postgres)
# Parsed: 9723, not parsed: 6415 (split ;)
# Parsed: 9769, not parsed: 6369 (E' -> ', no `)
# Parsed: 9571, not parsed: 6567 (added dialect='postgres')
# Parsed: 1192213, not parsed: 886185 (all)
# Parsed: 1192213, not parsed: 886185, None: 198696 (included in Parsed)
return return
if os.path.isfile(input_filepath_linked): if os.path.isfile(input_filepath_linked):
...@@ -116,7 +183,6 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st ...@@ -116,7 +183,6 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
link_questions_with_answers(input_filepath_all_answers, get_questions(input_filepath_postgresql_questions), input_filepath_linked) link_questions_with_answers(input_filepath_all_answers, get_questions(input_filepath_postgresql_questions), input_filepath_linked)
#creates input_filepath_codes #creates input_filepath_codes
save_code_sections(input_filepath_linked, input_filepath_codes) save_code_sections(input_filepath_linked, input_filepath_codes)
run(input_filepath_all_answers, input_filepath_postgresql_questions, input_filepath_linked, input_filepath_codes) run(input_filepath_all_answers, input_filepath_postgresql_questions, input_filepath_linked, input_filepath_codes)
if __name__ == "__main__": if __name__ == "__main__":
...@@ -125,9 +191,8 @@ if __name__ == "__main__": ...@@ -125,9 +191,8 @@ if __name__ == "__main__":
""" """
DONE: DONE:
1. config file, soubory do files/ 1. Fix analyze
2. struktura kapitoly, nová kapitola ohledně schémat 2. Fix kapitoly
3. test na větším počtu
"""
#TODO Fix analyze() - PL/pgSQL zahazuju (DECLARE @OuterPageSize int)
\ No newline at end of file """
\ No newline at end of file
...@@ -168,8 +168,18 @@ def tmp(): ...@@ -168,8 +168,18 @@ def tmp():
except sqlglot.errors.OptimizeError: except sqlglot.errors.OptimizeError:
expression_tree = undo expression_tree = undo
def whatever():
    """Scratch check of what ``sqlglot.parse`` returns for ';'-split pieces.

    Splits a fixed sample statement on ``';'`` and, for each piece, prints the
    repr of the parse result, plus ``'is None'`` when that result equals
    ``[None]``.  A ``ParseError`` is printed instead of propagating.
    """
    expr = "SELECT hello FROM y;"
    for exp in expr.split(';'):
        try:
            expression_tree = sqlglot.parse(exp)
            print(repr(expression_tree))
            if expression_tree == [None]:
                print('is None')
        except sqlglot.errors.ParseError as pe:
            print(f'ParseError: {pe}')


if __name__ == '__main__':
    whatever()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment