diff --git a/BachelorThesis.pdf b/BachelorThesis.pdf index 226441b206df9b1f09a10d98b69f97d195d1ae95..4560b8cf9a623b28eee6d5b8643048c3594e623a 100644 Binary files a/BachelorThesis.pdf and b/BachelorThesis.pdf differ diff --git a/main.py b/main.py index fe545ad04652afde751c1b06b8369f2d4e72106e..bad145d31479c8cc56bee4246e9e7bb520068105 100644 --- a/main.py +++ b/main.py @@ -1,3 +1,5 @@ +import sqlglot.dialects +import sqlglot.dialects.dialect import sqlglot.optimizer import sqlglot.optimizer.qualify from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections @@ -26,8 +28,54 @@ def strip_tags(html): s.feed(html) return s.get_data() -def erase_html(code_section: str) -> str: - result = strip_tags(' '.join(code_section)) +def replace_unrecognized_letters(result: str) -> str: + result = result.replace('E\'', '\'') + result = result.replace('`', '') + result = result.replace('Ĺ‚', 'l') + result = result.replace('ø', 'o') + if 'psql:latest.dump:1601: invalid command' in result: + return '' + result = result.replace('ÂŁ', '$') + result = result.replace('è', 'e').replace('Ă©', 'e').replace('É', 'e') + result = result.replace('Ă', 'i') + result = result.replace('ç', 'c') + result = result.replace('õ', 'o') + if '<?php if($funcao==1 )' in result: + return '' + result = result.replace('á', 'a') + result = result.replace('Ăł', 'o') + result = result.replace('Ĺ„', 'n') + result = result.replace('ñ', 'n') + if 'else { $GLOBALS[komunikat_edycja_agenta' in result: + return '' + result = result.replace('ĂŁ', 'a') + result = result.replace('©', 'c') + result = result.replace('ĂĽ', 'u') + result = result.replace('Ă—', 'x') + result = result.replace('Ăş', 'u') + result = result.replace('ĂŞ', 'e') + result = result.replace('«', '').replace('»', '') + result = result.replace('Ă ', 'a') + result = result.replace('Ń„', '') + result = result.replace('Ă´', 'o') + result = result.replace('ä', 'a') + result = result.replace('§', '$') + result = result.replace('Ăš', 'U') + result = result.replace('ö', 'o') + result = result.replace('â', 'a') + result = result.replace('Âş', '') + result = result.replace('¡', '') + result = result.replace('ČĽ', '') + result = result.replace('Ăź', 's') + result = result.replace('Ń', 'c') + result = result.replace('С', 'C') + result = result.replace('Ă‚', 'A') + result = result.replace('Ă·', '').replace('®', '').replace('¤', '').replace('¦', '') + + return result + +def erase_html(code_section: list) -> str: + result = strip_tags(';'.join(code_section)) pattern = r"\\+['a-zA-Z]" matches = re.findall(pattern, result) @@ -43,34 +91,40 @@ def erase_html(code_section: str) -> str: result = result.replace(match, '\n') else: result = result.replace(match, match[-2:]) - - return result - + return replace_unrecognized_letters(result) def analyze(id: str, codes: list) -> tuple: parsed = 0 not_parsed = 0 - for code in codes: - code = erase_html(code) - - correct = False - try: - expression_tree = sqlglot.parse(code, dialect='postgres') - correct = True - except sqlglot.errors.ParseError: - correct = False - except sqlglot.errors.TokenError: - correct = False - except: - correct = False - - if correct: - parsed += 1 - else: - not_parsed += 1 - return (parsed, not_parsed) + is_none = 0 + for code_list in codes: + all_codes_string = erase_html(code_list) + for code in all_codes_string.split(';'): + #print('----------------------') + #print(code) + correct = False + try: + expression_tree = sqlglot.parse(code, dialect='postgres') + if expression_tree == [None]: + is_none += 1 + correct = True + except sqlglot.errors.ParseError as pe: + correct = False + #print(f'----\nParseError: {pe}\n-----') + except sqlglot.errors.TokenError as te: + correct = False + #print(f'----\nTokenError: {te}\n-----') + except: + correct = False + + if correct: + parsed += 1 + else: + not_parsed += 1 + + return (parsed, not_parsed, is_none) def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None: """Main function. @@ -88,17 +142,30 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st if os.path.isfile(input_filepath_codes): codes = load_code_sections(input_filepath_codes) - count = 0 parsed = 0 not_parsed = 0 + is_none = 0 + for key, values in codes.items(): - if count < 1000: - tmp = analyze(key, values) - parsed += tmp[0] - not_parsed += tmp[1] - count += 1 - print(f'Parsed: {parsed}, not parsed: {not_parsed}') - # Parsed: 931, not parsed: 1802 + if not values: + continue + + tmp = analyze(key, values) + parsed += tmp[0] + not_parsed += tmp[1] + is_none += tmp[2] + + print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed)') + print('DONE!') + # Parsed: 931, not parsed: 1802 (pĹŻvodnÄ›) + # Parsed: 1122, not parsed: 2251 (ignor prázdnĂ˝ch) + # Parsed: 5189, not parsed: 3887 (split na sekce, bez dialektu postgres) + # Parsed: 9723, not parsed: 6415 (split ;) + # Parsed: 9769, not parsed: 6369 (E' -> ', no `) + + # Parsed: 9571, not parsed: 6567 (added dialect='postgres') + # Parsed: 1192213, not parsed: 886185 (all) + # Parsed: 1192213, not parsed: 886185, None: 198696 (included in Parsed) return if os.path.isfile(input_filepath_linked): @@ -116,7 +183,6 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st link_questions_with_answers(input_filepath_all_answers, get_questions(input_filepath_postgresql_questions), input_filepath_linked) #creates input_filepath_codes save_code_sections(input_filepath_linked, input_filepath_codes) - run(input_filepath_all_answers, input_filepath_postgresql_questions, input_filepath_linked, input_filepath_codes) if __name__ == "__main__": @@ -125,9 +191,8 @@ if __name__ == "__main__": """ DONE: - 1. config file, soubory do files/ - 2. struktura kapitoly, nová kapitola ohlednÄ› schĂ©mat - 3. test na vÄ›tšĂm poÄŤtu -""" + 1. Fix analyze + 2. Fix kapitoly -#TODO Fix analyze() \ No newline at end of file + - PL/pgSQL zahazuju (DECLARE @OuterPageSize int) +""" \ No newline at end of file diff --git a/tests.py b/tests.py index deefc4b4af61a92f18578823209d6b2679c4dc79..404317d652ff5e79ba8027eb163d6334cdc81bb9 100644 --- a/tests.py +++ b/tests.py @@ -168,8 +168,18 @@ def tmp(): except sqlglot.errors.OptimizeError: expression_tree = undo -if __name__ == '__main__': - #print('SQLMETADATA: ') - #sqlmetadata() +def whatever(): + expr = "SELECT hello FROM y;" + expr_split = expr.split(';') + for exp in expr_split: + try: + expression_tree = sqlglot.parse(exp) + print(repr(expression_tree)) + if expression_tree == [None]: + print('is None') + + except sqlglot.errors.ParseError as pe: + print(f'ParseError: {pe}') - tmp() \ No newline at end of file +if __name__ == '__main__': + whatever() \ No newline at end of file