dataset, kapitola

93dd4560 · Mesharo · 75dab8ba · 93dd4560 · 93dd4560 · 93dd4560
Commit 93dd4560 authored 8 months ago by Mesharo
--- a/BachelorThesis.pdf
+++ b/BachelorThesis.pdf
--- a/handling_dataset/main.py
+++ b/handling_dataset/main.py
@@ -89,12 +89,24 @@ def is_question_tagged_postgresql(postgresql_tags: list, question_tags: str) ->
    question_tags -- string of the current question's tags.
    """
    
-    correct = False
-    for tag in postgresql_tags:
-        if tag in question_tags:
-            correct = True
+    question_tags_split = re.findall('<(.+?)>', question_tags)
+
+    found_sql_tag = False
+    for question_tag in question_tags_split:
+        if question_tag == 'sql':
+            found_sql_tag = True
            break
-    return correct
+
+    if not found_sql_tag:
+        return False
+    
+    question_tags_split.remove('sql')
+    
+    for question_tag in question_tags_split:
+        for postgresql_tag in postgresql_tags:
+            if question_tag == postgresql_tag:
+                return True
+    return False

 def filter_postgresql_questions(input_filepath_posts_xml: str, input_filepath_tags_xml: str, output_filepath_questions: str, output_filepath_answers: str) -> None:
    """Filter out postgresql questions.
@@ -326,4 +338,11 @@ def load_code_sections(input_filepath_codes: str) -> dict:
            tmp = eval(line.rstrip())
            result[tmp[0]] = tmp[1]

-    return result
\ No newline at end of file
+    return result
+
+
+
+if __name__ == '__main__':
+    #filter_postgresql_questions('D:\\stackoverflow.com\\Posts.xml', 'D:\\stackoverflow.com\\Tags.xml', 'D:\\postgresql_questions.txt', 'D:\\all_answers.txt')
+    #get_postgresql_tags('D:\\stackoverflow.com\\Tags.xml')
+    pass
\ No newline at end of file
--- a/main.py
+++ b/main.py
 import sqlglot.dialects
 import sqlglot.dialects.dialect
+import sqlglot.errors
 import sqlglot.optimizer
 import sqlglot.optimizer.qualify
 from handling_dataset.main import filter_postgresql_questions, load_code_sections, get_questions, link_questions_with_answers, save_code_sections
 import re
 import os.path
 import sqlglot
-import copy
 import config

 from io import StringIO
@@ -98,6 +98,7 @@ def analyze(id: str, codes: list) -> tuple:
    parsed = 0
    not_parsed = 0
    is_none = 0
+    not_empty_columns_tables_aliases = 0
    for code_list in codes:
        all_codes_string = erase_html(code_list)
        for code in all_codes_string.split(';'):
@@ -109,6 +110,24 @@ def analyze(id: str, codes: list) -> tuple:
                expression_tree = sqlglot.parse(code, dialect='postgres')
                if expression_tree == [None]:
                    is_none += 1
+                    correct = True
+                    continue
+                
+                for ast in expression_tree:
+                    columns = ast.find_all(sqlglot.exp.Column)
+                    tables = ast.find_all(sqlglot.exp.Table)
+                    aliases = ast.find_all(sqlglot.exp.Alias)
+                    
+                    if next(columns, None) is not None:
+                        not_empty_columns_tables_aliases += 1
+                        continue
+                    if next(tables, None) is not None:
+                        not_empty_columns_tables_aliases += 1
+                        continue
+                    if next(aliases, None) is not None:
+                        not_empty_columns_tables_aliases += 1
+                        continue
+
                correct = True
            except sqlglot.errors.ParseError as pe:
                correct = False
@@ -116,6 +135,9 @@ def analyze(id: str, codes: list) -> tuple:
            except sqlglot.errors.TokenError as te:
                correct = False
                #print(f'----\nTokenError: {te}\n-----')
+            except sqlglot.errors.OptimizeError as oe:
+                correct = False
+                #print(f'----\nOptimizeError: {oe}\n-----')
            except:
                correct = False

@@ -124,7 +146,7 @@ def analyze(id: str, codes: list) -> tuple:
            else:
                not_parsed += 1   

-    return (parsed, not_parsed, is_none)
+    return (parsed, not_parsed, is_none, not_empty_columns_tables_aliases)

 def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: str, input_filepath_linked: str, input_filepath_codes: str) -> None:
    """Main function.
@@ -145,6 +167,7 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
        parsed = 0
        not_parsed = 0
        is_none = 0
+        not_empty_columns_tables_aliases = 0

        for key, values in codes.items():
            if not values:
@@ -154,18 +177,12 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st
            parsed += tmp[0]
            not_parsed += tmp[1]
            is_none += tmp[2]
+            not_empty_columns_tables_aliases += tmp[3]
            
-        print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed)')
+        print(f'Parsed: {parsed}, not parsed: {not_parsed}, None: {is_none} (included in Parsed), Found col/table/alias: {not_empty_columns_tables_aliases}')
        print('DONE!')
-        # Parsed: 931, not parsed: 1802 (původně)
-        # Parsed: 1122, not parsed: 2251 (ignor prázdných)
-        # Parsed: 5189, not parsed: 3887 (split na sekce, bez dialektu postgres)
-        # Parsed: 9723, not parsed: 6415 (split ;)
-        # Parsed: 9769, not parsed: 6369 (E' -> ', no `)
-
-        # Parsed: 9571, not parsed: 6567 (added dialect='postgres')
-        # Parsed: 1192213, not parsed: 886185 (all)
-        # Parsed: 1192213, not parsed: 886185, None: 198696 (included in Parsed)
+        # new dataset: Parsed: 424400, not parsed: 201498, None: 80004 (included in Parsed)
+        # Parsed: 344396, not parsed: 201498, None: 80004 (not included in Parsed), Found col/table/alias: 280418
        return

    if os.path.isfile(input_filepath_linked):
@@ -187,12 +204,4 @@ def run(input_filepath_all_answers: str, input_filepath_postgresql_questions: st

 if __name__ == "__main__":
    run(config.filepaths['all_answers'], config.filepaths['postgresql_questions'], config.filepaths['linked'], config.filepaths['codes'])
-    
-
-"""
-DONE:
-    1. Fix analyze
-    2. Fix kapitoly
-
-    - PL/pgSQL zahazuju (DECLARE @OuterPageSize int)
-"""
\ No newline at end of file
+    
\ No newline at end of file
--- a/tests.py
+++ b/tests.py
@@ -78,6 +78,7 @@ def testing2():

    print(mydict)

+import sqlglot.optimizer.scope
 import sqlparse
 def erasing_backslashes():
    xd = """SELECT \\\'ALTER TABLE "\\\'||nspname||\\\'"."\\\'||relname||\\\'" DROP CONSTRAINT "\\\'||conname||\\\'";\\\'
@@ -118,6 +119,7 @@ def sqlmetadata():
 import sqlglot
 import sqlglot.optimizer
 import sqlglot.optimizer.qualify
+from sqlglot import exp
 def sqlglottry():
    for column in sqlglot.parse_one("""
 insert into EscapeTest (text) values (E'This is the first part 
@@ -169,17 +171,65 @@ def tmp():
        expression_tree = undo

 def whatever():
-    expr = "SELECT hello FROM y;"
-    expr_split = expr.split(';')
-    for exp in expr_split:
-        try:
-            expression_tree = sqlglot.parse(exp)
-            print(repr(expression_tree))
-            if expression_tree == [None]:
-                print('is None')
-
-        except sqlglot.errors.ParseError as pe:
-            print(f'ParseError: {pe}')
+    expr = "SELECT xd.hello FROM y AS xd"
+    try:
+        expression_tree = sqlglot.parse(expr)
+        print(repr(expression_tree))
+                
+        for tmp in expression_tree:
+            columns = tmp.find_all(exp.Column)
+            tables = tmp.find_all(exp.Table)
+            aliases = tmp.find_all(exp.Alias)
+
+            print('Columns!')
+            for column in columns:
+                print(column.name)
+
+            print('Tables!')
+            for table in tables:
+                print(table.name)
+
+            print('Aliases!')
+            for alias in aliases:
+                print(alias.name)
+        
+        """
+        for node in expression_tree.args['expressions']:
+            if isinstance(node, exp.Column):
+                if (node.args['this']):
+                    print(f'Column name: {node.args["this"]}')
+                if (node.args['table']):
+                    print(f'from table: {node.args["table"]}')
+            
+            if isinstance(node, exp.Alias):
+                if (node.args['this']):
+                    print(f'Column name: {node.args["this"]}')
+
+            if isinstance(node, exp.Table):
+                if (node.args['this']):
+                    print(f'Column name: {node.args["this"]}')
+                if (node.args['alias']):
+                    print(f'from table: {node.args["alias"]}')
+        """
+
+    except sqlglot.errors.ParseError as pe:
+        print(f'ParseError: {pe}')
+
+def test_qualify():
+    statement = "INSERT INTO first VALUES((SELECT hello FROM second), world);"
+
+    try:
+        ASTs = sqlglot.parse(statement)
+        print(repr(ASTs))
+        for AST in ASTs:
+            root = sqlglot.optimizer.scope.build_scope(AST)
+            print(root)
+            for scope in root.traverse():
+                print(scope)
+    except sqlglot.errors.OptimizeError as oe:
+        print(f'----\nOptimizeError: {oe}\n-----')
+

 if __name__ == '__main__':
-    whatever()
\ No newline at end of file
+    #whatever()
+    test_qualify()
\ No newline at end of file