# db_interactor/database/db_adapter.py

import logging
import os
import time

import pandas as pd
import sqlalchemy as sq
import sqlparse

from database.QueryParameters import QueryParameters
from models.DatabaseConfig import DatabaseConfig
from models.DatabaseType import DatabaseType
from keepass.Keepass import KeePass
from models.ExportType import ExportType
from models.Municipality import Municipality


class DBAdapter:
    """Runs SQL against a configured database through a SQLAlchemy engine and exports query results to files."""
    # Engine created in __init__ from the database config and the KeePass credentials.
    _engine: sq.Engine
    # Connection settings; its `type` selects the dialect-specific statements used by the helpers.
    _database_config: DatabaseConfig
    # NOTE(review): not referenced anywhere in the visible part of this class.
    _has_tables_been_initialized: bool = False
    # Logger injected in __init__ and used for all progress messages.
    _logger: logging.Logger
    # Prefix that _generate_filename puts in front of every exported file name.
    _output_folder: str = 'output/'

    def __init__(self, keepass: KeePass, database_config: DatabaseConfig, logger: logging.Logger):
        self._database_config = database_config
        connection_string: str
        keepass_entry = keepass.get_db_credentials()
        self._logger = logger

        match self._database_config.type:
            case DatabaseType.PSQL:
                connection_string = f'postgresql+pg8000://{keepass_entry.name}:{keepass_entry.password}@{self._database_config.host}/{self._database_config.name}'
            case DatabaseType.ORCL:
                connection_string = f'oracle+cx_oracle://{keepass_entry.name}:{keepass_entry.password}@{self._database_config.host}:{self._database_config.port}/{self._database_config.ssid}'
            case DatabaseType.SQLITE:
                connection_string = f'sqlite:///{self._database_config.host}'
            case _:
                raise Exception(f'Database type {database_config.type} is not supported')
        logger.info(f'Initializing database {database_config.host}:{database_config.name}')
        self._engine: sq.Engine = sq.create_engine(connection_string)
        logger.info('Database initialized')

    def _set_transaction_readonly(self, conn: sq.Connection):
        self._logger.info('Setting transaction to readonly.')
        if not conn.in_transaction():
            raise Exception('Connection is not in a transaction')

        match self._database_config.type:
            case DatabaseType.PSQL | DatabaseType.ORCL:
                conn.execute(sq.text('SET TRANSACTION READ ONLY'))
            case _:
                raise Exception(
                    f'Database type {self._database_config.type} is not supported for readonly transactions')

    def _set_schema(self, conn: sq.Connection, schema: str):
        self._logger.info(f'Setting schema to "{schema}"')
        if not conn.in_transaction():
            raise Exception('Connection is not in a transaction')

        match self._database_config.type:
            case DatabaseType.ORCL:
                conn.execute(sq.text(f"alter session set current_schema = {schema}"))
            case DatabaseType.PSQL:
                conn.execute(sq.text(f"set schema '{schema}'"))
            case _:
                raise Exception(
                    f'Database type {self._database_config.type} is not supported for readonly transactions')

    def _generate_filename(self, filename: str) -> str:
        try:
            os.mkdir(self._output_folder)
        except FileExistsError:
            pass
        return f'{self._output_folder}{filename}'

    def _export_to_file(self, export_type, output_file_name, result, query_parameter: QueryParameters = None):
        match export_type:
            case ExportType.CSV:
                output_file_name += '.csv'
                result.to_csv(output_file_name, index=False, sep=';', encoding='utf-8')
            case ExportType.EXCEL:
                output_file_name += '.xlsx'
                result.to_excel(output_file_name, index=False)
            case ExportType.XML:
                output_file_name += '.xml'
                result.to_xml(output_file_name, index=False)

        self._logger.info(f'Created file {output_file_name}')

    def _extract_dataframe(self, conn: sq.Connection, query: str, read_only: bool, query_parameters: QueryParameters,
                           schema: str | None = None) -> pd.DataFrame:
        result: pd.DataFrame

        with conn.begin():
            self._logger.info("Starting transaction")
            try:
                if read_only:
                    self._set_transaction_readonly(conn)
                if schema is not None:
                    self._set_schema(conn, schema)
                result = self._extract_dataframe_no_safeguards(conn, query, query_parameters)
            except Exception as e:
                conn.rollback()
                raise e
        return result

    def _extract_dataframe_no_safeguards(self, conn: sq.Connection, query: str,
                                         query_parameter: QueryParameters) -> pd.DataFrame:
        result: pd.DataFrame
        start = time.time()
        result = pd.read_sql(query, conn, params=query_parameter.query_parameters)
        end = time.time()
        self._logger.info(f'Query took {(end - start):.4f} seconds')
        return result

    def run_sql_file_export_to_file_multiple_schemas(self, municipalities: list[Municipality],
                                                     query_parameter: QueryParameters,
                                                     read_only=True):
        self.run_sql_files_export_to_files_multiple_schemas(municipalities, [query_parameter], read_only)

    def run_sql_file_multiple_statements(self, query_parameter: QueryParameters):
        """
        Runs an SQL file, supports multiple statements, does not support plsql.
        If any statements fail, throws an error and rolls back.
        :param query_parameter: contains data about the queries to run and how to find them
        :return: Nothing
        """
        queries = query_parameter.get_queries()
        self._logger.info(queries)
        self._logger.info(f'Running {len(queries)} queries')
        with self._engine.connect() as conn:
            with conn.begin():
                self._logger.info("Starting transaction")
                try:
                    for index, query in enumerate(queries):
                        start = time.time()
                        conn.execute(sq.text(query), parameters=query_parameter.query_parameters)
                        end = time.time()
                        self._logger.info(
                            f'({index + 1} / {len(queries)}) Query took {(end - start):.4f} seconds ({query})')
                    conn.commit()
                except Exception as e:
                    self._logger.info(f'Transaction rollback')
                    conn.rollback()
                    raise e
        self._logger.info(f'Transaction commited')

    def run_sql_files_export_to_files_multiple_schemas(self, municipalities: list[Municipality],
                                                       query_parameters: list[QueryParameters] = None,
                                                       read_only: bool = True):
        """"
        Runs the list of granted sql files against the list of municipalities
        :param export_type: the type of files to export
        :param municipalities: a list of municipalities
        :param query_parameters: a list of sql files to run TODO: make this a pair list with sql file and translation for root_name and row_name to give the XML file the correct namespaces, consider using the stylesheet option from panda to xml https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_xml.html
        :param read_only: if the transaction should be set too read-only to avoid changes to the database
        :return: Nothing
        """
        with self._engine.connect() as conn:
            with conn.begin():
                if read_only:
                    self._set_transaction_readonly(conn)

                for municipality_index, municipality in enumerate(municipalities):
                    self._logger.info(
                        f'({municipality_index + 1}/{len(municipalities)}) Starting to process municipality {municipality.name} ({municipality.schema})')
                    self._set_schema(conn, municipality.schema)
                    file_prefix = f'{municipality.kommunekode}/'

                    for query_file_index, query_parameter in enumerate(query_parameters):
                        queries = query_parameter.get_queries()
                        self._logger.info(
                            f'({query_file_index + 1}/{len(municipalities)}) Starting to process query with title: {query_parameter.title}')

                        if not len(queries) != 1:
                            self._logger.error(f'Query file {query_parameter.title} failed due to multiple queries')
                            raise Exception(f'Query file {query_parameter.title} failed due to multiple queries')

                        query = queries[0]
                        dataframe = self._extract_dataframe_no_safeguards(conn, query, query_parameter)
                        self._logger.info(
                            f'[{municipality.kommunekode}][{query_parameter.title}][{len(dataframe.index)}]')
                        filename = self._generate_filename(f'{file_prefix}{query_parameter.title}')

                        self._export_to_file(query_parameter.export_type, filename, dataframe, query_parameter)
                        self._logger.info(f'Finished processing {query_parameter.title} generated file {filename}')