Module pipelines.utils.dump_db.tasks
General-purpose tasks for dumping database data.
Functions
def database_execute(database: Database,
query: str,
wait=None,
flow_name: str = None,
labels: List[str] = None,
dataset_id: str = None,
table_id: str = None) -> None
@task(
    checkpoint=False,
    max_retries=constants.TASK_MAX_RETRIES.value,
    retry_delay=timedelta(seconds=constants.TASK_RETRY_DELAY.value),
)
def database_execute(
    database: Database,
    query: str,
    wait=None,  # pylint: disable=unused-argument
    flow_name: str = None,
    labels: List[str] = None,
    dataset_id: str = None,
    table_id: str = None,
) -> None:
    """
    Executes a query on the database.

    Args:
        database: The database object.
        query: The query to execute.
    """
    start_time = time()
    log(f"Query parsed: {query}")
    query = remove_tabs_from_query(query)
    log(f"Executing query line: {query}")
    database.execute_query(query)
    time_elapsed = time() - start_time
    doc = format_document(
        flow_name=flow_name,
        labels=labels,
        event_type="db_execute",
        dataset_id=dataset_id,
        table_id=table_id,
        metrics={"db_execute": time_elapsed},
    )
    index_document(doc)
Executes a query on the database.
Args
database
- The database object.
query
- The query to execute.
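In a flow, this task is normally chained after database_get. A minimal usage sketch, assuming a Prefect 1.x flow context (implied by the @task decorator) and placeholder connection values; the "postgresql" key is hypothetical and must exist in DATABASE_MAPPING:

from prefect import Flow

from pipelines.utils.dump_db.tasks import database_execute, database_get

with Flow("dump_db_example") as flow:
    # Placeholder credentials; real flows resolve these from secrets/Parameters.
    db = database_get(
        database_type="postgresql",  # hypothetical key in DATABASE_MAPPING
        hostname="db.example.com",
        port=5432,
        user="reader",
        password="***",
        database="mydb",
    )
    # Executes the query and indexes a "db_execute" timing metric.
    database_execute(
        database=db,
        query="SELECT 1",
        flow_name="dump_db_example",
        labels=["staging"],
        dataset_id="example_dataset",
        table_id="example_table",
    )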
def database_fetch(database: Database,
batch_size: str,
wait=None,
flow_name: str = None,
labels: List[str] = None,
dataset_id: str = None,
table_id: str = None)
@task(
    checkpoint=False,
    max_retries=constants.TASK_MAX_RETRIES.value,
    retry_delay=timedelta(seconds=constants.TASK_RETRY_DELAY.value),
)
def database_fetch(
    database: Database,
    batch_size: str,
    wait=None,  # pylint: disable=unused-argument
    flow_name: str = None,
    labels: List[str] = None,
    dataset_id: str = None,
    table_id: str = None,
):
    """
    Fetches the results of a query on the database.
    """
    start_time = time()
    if batch_size == "all":
        log(f"columns: {database.get_columns()}")
        log(f"All rows: {database.fetch_all()}")
    else:
        try:
            batch_size_no = int(batch_size)
        except ValueError as error:
            raise ValueError(f"Invalid batch size: {batch_size}") from error
        log(f"columns: {database.get_columns()}")
        log(f"{batch_size_no} rows: {database.fetch_batch(batch_size_no)}")
    time_elapsed = time() - start_time
    doc = format_document(
        flow_name=flow_name,
        labels=labels,
        event_type="db_fetch",
        dataset_id=dataset_id,
        table_id=table_id,
        metrics={"db_fetch": time_elapsed},
    )
    index_document(doc)
Fetches the results of a query on the database.
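The batch_size argument is a string: either the literal "all" (fetch everything) or an integer in string form. A standalone sketch of that parsing rule, mirroring the branch in the source above:

from typing import Optional

def parse_batch_size(batch_size: str) -> Optional[int]:
    # None means "fetch all rows"; otherwise return the integer batch size.
    if batch_size == "all":
        return None
    try:
        return int(batch_size)
    except ValueError as error:
        raise ValueError(f"Invalid batch size: {batch_size}") from error

assert parse_batch_size("all") is None
assert parse_batch_size("500") == 500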
def database_get(database_type: str,
hostname: str,
port: int,
user: str,
password: str,
database: str,
wait=None) -> Database
@task(
    checkpoint=False,
    max_retries=constants.TASK_MAX_RETRIES.value,
    retry_delay=timedelta(seconds=constants.TASK_RETRY_DELAY.value),
)
def database_get(
    database_type: str,
    hostname: str,
    port: int,
    user: str,
    password: str,
    database: str,
    wait=None,  # pylint: disable=unused-argument
) -> Database:
    """
    Returns a database object.

    Args:
        database_type: The type of the database.
        hostname: The hostname of the database.
        port: The port of the database.
        user: The username of the database.
        password: The password of the database.
        database: The database name.

    Returns:
        A database object.
    """
    if database_type not in DATABASE_MAPPING:
        raise ValueError(f"Unknown database type: {database_type}")
    return DATABASE_MAPPING[database_type](
        hostname=hostname,
        port=port,
        user=user,
        password=password,
        database=database,
    )
Returns a database object.
Args
database_type
- The type of the database.
hostname
- The hostname of the database.
port
- The port of the database.
user
- The username of the database.
password
- The password of the database.
database
- The database name.
Returns
A database object.
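The lookup goes through DATABASE_MAPPING, a dict from type name to Database subclass defined elsewhere in the package. A minimal sketch of that dispatch pattern with hypothetical keys and stand-in classes (the real mapping and class names may differ):

from typing import Dict, Type

class Database:  # stand-in for the package's Database base class
    def __init__(self, hostname, port, user, password, database):
        self.hostname, self.port = hostname, port
        self.user, self.password, self.database = user, password, database

class MySql(Database): ...
class Oracle(Database): ...
class SqlServer(Database): ...

# Hypothetical contents; the real keys live in the package's database utilities.
DATABASE_MAPPING: Dict[str, Type[Database]] = {
    "mysql": MySql,
    "oracle": Oracle,
    "sql_server": SqlServer,
}

def get_database(database_type: str, **connection) -> Database:
    if database_type not in DATABASE_MAPPING:
        raise ValueError(f"Unknown database type: {database_type}")
    return DATABASE_MAPPING[database_type](**connection)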
def dump_batches_to_file(database: Database,
batch_size: int,
prepath: str | pathlib.Path,
partition_columns: List[str] = None,
batch_data_type: str = 'csv',
wait=None,
flow_name: str = None,
labels: List[str] = None,
dataset_id: str = None,
table_id: str = None) -> pathlib.Path
@task(
    max_retries=constants.TASK_MAX_RETRIES.value,
    retry_delay=timedelta(seconds=constants.TASK_RETRY_DELAY.value),
    nout=2,
)
def dump_batches_to_file(  # pylint: disable=too-many-locals,too-many-statements
    database: Database,
    batch_size: int,
    prepath: Union[str, Path],
    partition_columns: List[str] = None,
    batch_data_type: str = "csv",
    wait=None,  # pylint: disable=unused-argument
    flow_name: str = None,
    labels: List[str] = None,
    dataset_id: str = None,
    table_id: str = None,
) -> Path:
    """
    Dumps batches of data to FILE.
    """
    # Get columns
    columns = database.get_columns()
    log(f"Got columns: {columns}")

    new_query_cols = build_query_new_columns(table_columns=columns)
    log("New query columns without accents:")
    log(f"{new_query_cols}")

    prepath = Path(prepath)
    log(f"Got prepath: {prepath}")

    if not partition_columns or partition_columns[0] == "":
        partition_column = None
    else:
        partition_column = partition_columns[0]

    if not partition_column:
        log("NO partition column specified! Writing unique files")
    else:
        log(f"Partition column: {partition_column} FOUND!! Write to partitioned files")

    # Initialize queues
    batches = Queue()
    dataframes = Queue()

    # Define thread functions
    def thread_batch_to_dataframe(
        batches: Queue,
        dataframes: Queue,
        done: Event,
        columns: List[str],
        flow_name: str,
        labels: List[str],
        dataset_id: str,
        table_id: str,
    ):
        while not done.is_set():
            try:
                batch = batches.get(timeout=1)
                start_time = time()
                dataframe = batch_to_dataframe(batch, columns)
                elapsed_time = time() - start_time
                dataframes.put(dataframe)
                doc = format_document(
                    flow_name=flow_name,
                    labels=labels,
                    event_type="batch_to_dataframe",
                    dataset_id=dataset_id,
                    table_id=table_id,
                    metrics={"batch_to_dataframe": elapsed_time},
                )
                index_document(doc)
                batches.task_done()
            except Empty:
                sleep(1)

    def thread_dataframe_to_csv(
        dataframes: Queue,
        done: Event,
        flow_name: str,
        labels: List[str],
        dataset_id: str,
        table_id: str,
        partition_column: str,
        partition_columns: List[str],
        prepath: Path,
        batch_data_type: str,
        eventid: str,
    ):
        while not done.is_set():
            try:
                # Get dataframe from queue
                dataframe: pd.DataFrame = dataframes.get(timeout=1)
                # Clean dataframe
                start_time = time()
                old_columns = dataframe.columns.tolist()
                dataframe.columns = remove_columns_accents(dataframe)
                new_columns_dict = dict(zip(old_columns, dataframe.columns.tolist()))
                dataframe = clean_dataframe(dataframe)
                elapsed_time = time() - start_time
                doc = format_document(
                    flow_name=flow_name,
                    labels=labels,
                    event_type="clean_dataframe",
                    dataset_id=dataset_id,
                    table_id=table_id,
                    metrics={"clean_dataframe": elapsed_time},
                )
                index_document(doc)
                # Dump dataframe to file
                start_time = time()
                if partition_column:
                    dataframe, date_partition_columns = parse_date_columns(
                        dataframe, new_columns_dict[partition_column]
                    )
                    partitions = date_partition_columns + [
                        new_columns_dict[col] for col in partition_columns[1:]
                    ]
                    to_partitions(
                        data=dataframe,
                        partition_columns=partitions,
                        savepath=prepath,
                        data_type=batch_data_type,
                    )
                elif batch_data_type == "csv":
                    dataframe_to_csv(dataframe, prepath / f"{eventid}-{uuid4()}.csv")
                elif batch_data_type == "parquet":
                    dataframe_to_parquet(
                        dataframe, prepath / f"{eventid}-{uuid4()}.parquet"
                    )
                elapsed_time = time() - start_time
                doc = format_document(
                    flow_name=flow_name,
                    labels=labels,
                    event_type=f"batch_to_{batch_data_type}",
                    dataset_id=dataset_id,
                    table_id=table_id,
                    metrics={f"batch_to_{batch_data_type}": elapsed_time},
                )
                index_document(doc)
                dataframes.task_done()
            except Empty:
                sleep(1)

    # Initialize threads
    done = Event()
    eventid = datetime.now().strftime("%Y%m%d-%H%M%S")
    worker_batch_to_dataframe = Thread(
        target=thread_batch_to_dataframe,
        args=(
            batches,
            dataframes,
            done,
            columns,
            flow_name,
            labels,
            dataset_id,
            table_id,
        ),
    )
    worker_dataframe_to_csv = Thread(
        target=thread_dataframe_to_csv,
        args=(
            dataframes,
            done,
            flow_name,
            labels,
            dataset_id,
            table_id,
            partition_column,
            partition_columns,
            prepath,
            batch_data_type,
            eventid,
        ),
    )
    worker_batch_to_dataframe.start()
    worker_dataframe_to_csv.start()

    # Dump batches
    start_fetch_batch = time()
    batch = database.fetch_batch(batch_size)
    time_fetch_batch = time() - start_fetch_batch
    doc = format_document(
        flow_name=flow_name,
        labels=labels,
        event_type="fetch_batch",
        dataset_id=dataset_id,
        table_id=table_id,
        metrics={"fetch_batch": time_fetch_batch},
    )
    index_document(doc)
    idx = 0
    while len(batch) > 0:
        if idx % 100 == 0:
            log(f"Dumping batch {idx} with size {len(batch)}")
        # Add current batch to queue
        batches.put(batch)
        # Get next batch
        start_fetch_batch = time()
        batch = database.fetch_batch(batch_size)
        time_fetch_batch = time() - start_fetch_batch
        doc = format_document(
            flow_name=flow_name,
            labels=labels,
            event_type="fetch_batch",
            dataset_id=dataset_id,
            table_id=table_id,
            metrics={"fetch_batch": time_fetch_batch},
        )
        index_document(doc)
        idx += 1

    log("Waiting for batches queue...")
    start_sleep = 1
    max_sleep = 300
    while batches.unfinished_tasks > 0:
        sleep(start_sleep)
        start_sleep = min(start_sleep * 2, max_sleep)
        log(
            f"Waiting for {batches.unfinished_tasks} batches to be parsed as dataframes..."
        )
    batches.join()

    start_sleep = 1
    log("Waiting for dataframes queue...")
    while dataframes.unfinished_tasks > 0:
        sleep(start_sleep)
        start_sleep = min(start_sleep * 2, max_sleep)
        log(f"Waiting for {dataframes.unfinished_tasks} dataframes to be dumped...")
    dataframes.join()

    done.set()

    log("Waiting for threads to finish...")
    worker_batch_to_dataframe.join()
    worker_dataframe_to_csv.join()

    log(
        f"Successfully dumped {idx} batches with size {len(batch)}, total of {idx*batch_size}"
    )
    return prepath, idx
Dumps batches of data to FILE.
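The task overlaps fetching and writing with a producer/consumer pattern: the main thread pushes raw batches onto a Queue, one worker turns them into dataframes, a second worker writes the files, and an Event tells the workers to exit once the queues are drained. A stripped-down sketch of that pattern with generic payloads (no database involved):

from queue import Empty, Queue
from threading import Event, Thread
from time import sleep

def worker(inbox: Queue, done: Event, handle) -> None:
    # Process items until the producer sets `done`; the producer only sets it
    # after Queue.join() confirms every queued item was handled.
    while not done.is_set():
        try:
            item = inbox.get(timeout=1)
            handle(item)
            inbox.task_done()
        except Empty:
            sleep(1)

batches: Queue = Queue()
done = Event()
consumer = Thread(target=worker, args=(batches, done, print))
consumer.start()

for batch in (["row1", "row2"], ["row3"]):  # stands in for database.fetch_batch()
    batches.put(batch)

batches.join()  # block until every batch was consumed
done.set()      # then let the worker exit its loop
consumer.join()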
def dump_upload_batch(database: Database,
batch_size: int,
dataset_id: str,
table_id: str,
dump_mode: str,
partition_columns: List[str] = None,
batch_data_type: str = 'csv',
biglake_table: bool = True,
log_number_of_batches: int = 100)
@task
def dump_upload_batch(
    database: Database,
    batch_size: int,
    dataset_id: str,
    table_id: str,
    dump_mode: str,
    partition_columns: List[str] = None,
    batch_data_type: str = "csv",
    biglake_table: bool = True,
    log_number_of_batches: int = 100,
):
    """
    This task will dump and upload batches of data, sequentially.
    """
    # Log BD version
    bd_version = bd.__version__
    log(f"Using basedosdados@{bd_version}")

    # Keep track of cleared stuff
    prepath = f"data/{uuid4()}/"
    cleared_partitions = set()
    cleared_table = False

    # Get data columns
    columns = database.get_columns()
    log(f"Got columns: {columns}")

    new_query_cols = build_query_new_columns(table_columns=columns)
    log(f"New query columns without accents: {new_query_cols}")

    prepath = Path(prepath)
    log(f"Got prepath: {prepath}")

    if not partition_columns or partition_columns[0] == "":
        partition_column = None
    else:
        partition_column = partition_columns[0]

    if not partition_column:
        log("NO partition column specified! Writing unique files")
    else:
        log(f"Partition column: {partition_column} FOUND!! Write to partitioned files")

    # Now loop until we have no more data.
    batch = database.fetch_batch(batch_size)
    idx = 0
    while len(batch) > 0:
        # Log progress each 100 batches.
        log_mod(
            msg=f"Dumping batch {idx} with size {len(batch)}",
            index=idx,
            mod=log_number_of_batches,
        )

        # Dump batch to file.
        dataframe = batch_to_dataframe(batch, columns)
        old_columns = dataframe.columns.tolist()
        dataframe.columns = remove_columns_accents(dataframe)
        new_columns_dict = dict(zip(old_columns, dataframe.columns.tolist()))
        dataframe = clean_dataframe(dataframe)
        saved_files = []
        if partition_column:
            dataframe, date_partition_columns = parse_date_columns(
                dataframe, new_columns_dict[partition_column]
            )
            partitions = date_partition_columns + [
                new_columns_dict[col] for col in partition_columns[1:]
            ]
            saved_files = to_partitions(
                data=dataframe,
                partition_columns=partitions,
                savepath=prepath,
                data_type=batch_data_type,
                suffix=f"{datetime.now().strftime('%Y%m%d-%H%M%S')}",
            )
        elif batch_data_type == "csv":
            fname = prepath / f"{uuid4()}.csv"
            dataframe_to_csv(dataframe, fname)
            saved_files = [fname]
        elif batch_data_type == "parquet":
            fname = prepath / f"{uuid4()}.parquet"
            dataframe_to_parquet(dataframe, fname)
            saved_files = [fname]
        else:
            raise ValueError(f"Unknown data type: {batch_data_type}")

        # Log progress each 100 batches.
        log_mod(
            msg=f"Batch generated {len(saved_files)} files. Will now upload.",
            index=idx,
            mod=log_number_of_batches,
        )

        # Upload files.
        tb = bd.Table(dataset_id=dataset_id, table_id=table_id)
        table_staging = f"{tb.table_full_name['staging']}"
        st = bd.Storage(dataset_id=dataset_id, table_id=table_id)
        storage_path = f"{st.bucket_name}.staging.{dataset_id}.{table_id}"
        storage_path_link = (
            f"https://console.cloud.google.com/storage/browser/{st.bucket_name}"
            f"/staging/{dataset_id}/{table_id}"
        )
        dataset_is_public = tb.client["bigquery_prod"].project == "datario"

        # If we have a partition column
        if partition_column:
            # Extract the partition from the filenames
            partitions = []
            for saved_file in saved_files:
                # Remove the prepath and filename. This is the partition.
                partition = str(saved_file).replace(str(prepath), "")
                partition = partition.replace(saved_file.name, "")
                # Strip slashes from beginning and end.
                partition = partition.strip("/")
                # Add to list.
                partitions.append(partition)

            # Remove duplicates.
            partitions = list(set(partitions))
            log_mod(
                msg=f"Got partitions: {partitions}",
                index=idx,
                mod=log_number_of_batches,
            )

            # Loop through partitions and delete files from GCS.
            blobs_to_delete = []
            for partition in partitions:
                if partition not in cleared_partitions:
                    blobs = list_blobs_with_prefix(
                        bucket_name=st.bucket_name,
                        prefix=f"staging/{dataset_id}/{table_id}/{partition}",
                    )
                    blobs_to_delete.extend(blobs)
                    cleared_partitions.add(partition)
            if blobs_to_delete:
                delete_blobs_list(bucket_name=st.bucket_name, blobs=blobs_to_delete)
                log_mod(
                    msg=f"Deleted {len(blobs_to_delete)} blobs from GCS: {blobs_to_delete}",
                    index=idx,
                    mod=log_number_of_batches,
                )

        if dump_mode == "append":
            if tb.table_exists(mode="staging"):
                log_mod(
                    msg=(
                        "MODE APPEND: Table ALREADY EXISTS:"
                        + f"\n{table_staging}"
                        + f"\n{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )
            else:
                # the header is needed to create a table when it doesn't exist
                log_mod(
                    msg="MODE APPEND: Table DOESN'T EXIST\nStart to CREATE HEADER file",
                    index=idx,
                    mod=log_number_of_batches,
                )
                header_path = dump_header_to_file(data_path=saved_files[0])
                log_mod(
                    msg="MODE APPEND: Created HEADER file:\n" f"{header_path}",
                    index=idx,
                    mod=log_number_of_batches,
                )

                tb.create(
                    path=header_path,
                    if_storage_data_exists="replace",
                    if_table_exists="replace",
                    biglake_table=biglake_table,
                    dataset_is_public=dataset_is_public,
                )

                log_mod(
                    msg=(
                        "MODE APPEND: Successfully CREATED A NEW TABLE:\n"
                        + f"{table_staging}\n"
                        + f"{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )  # pylint: disable=C0301

                if not cleared_table:
                    st.delete_table(
                        mode="staging",
                        bucket_name=st.bucket_name,
                        not_found_ok=True,
                    )
                    log_mod(
                        msg=(
                            "MODE APPEND: Successfully REMOVED HEADER DATA from Storage:\n"
                            + f"{storage_path}\n"
                            + f"{storage_path_link}"
                        ),
                        index=idx,
                        mod=log_number_of_batches,
                    )  # pylint: disable=C0301
                    cleared_table = True
        elif dump_mode == "overwrite":
            if tb.table_exists(mode="staging") and not cleared_table:
                log_mod(
                    msg=(
                        "MODE OVERWRITE: Table ALREADY EXISTS, DELETING OLD DATA!\n"
                        + f"{storage_path}\n"
                        + f"{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )  # pylint: disable=C0301
                st.delete_table(
                    mode="staging", bucket_name=st.bucket_name, not_found_ok=True
                )
                log_mod(
                    msg=(
                        "MODE OVERWRITE: Successfully DELETED OLD DATA from Storage:\n"
                        + f"{storage_path}\n"
                        + f"{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )  # pylint: disable=C0301
                # delete only staging table and let DBT overwrite the prod table
                tb.delete(mode="staging")
                log_mod(
                    msg=(
                        "MODE OVERWRITE: Successfully DELETED TABLE:\n"
                        + f"{table_staging}\n"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )  # pylint: disable=C0301

            if not cleared_table:
                # the header is needed to create a table when it doesn't exist
                # in overwrite mode the header is always created
                st.delete_table(
                    mode="staging", bucket_name=st.bucket_name, not_found_ok=True
                )
                log_mod(
                    msg=(
                        "MODE OVERWRITE: Successfully DELETED OLD DATA from Storage:\n"
                        + f"{storage_path}\n"
                        + f"{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )  # pylint: disable=C0301

                log_mod(
                    msg="MODE OVERWRITE: Table DOESN'T EXIST\nStart to CREATE HEADER file",
                    index=idx,
                    mod=log_number_of_batches,
                )
                header_path = dump_header_to_file(data_path=saved_files[0])
                log_mod(
                    "MODE OVERWRITE: Created HEADER file:\n" f"{header_path}",
                    index=idx,
                    mod=log_number_of_batches,
                )

                tb.create(
                    path=header_path,
                    if_storage_data_exists="replace",
                    if_table_exists="replace",
                    biglake_table=biglake_table,
                    dataset_is_public=dataset_is_public,
                )

                log_mod(
                    msg=(
                        "MODE OVERWRITE: Successfully CREATED TABLE\n"
                        + f"{table_staging}\n"
                        + f"{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )

                st.delete_table(
                    mode="staging", bucket_name=st.bucket_name, not_found_ok=True
                )
                log_mod(
                    msg=(
                        "MODE OVERWRITE: Successfully REMOVED HEADER DATA from Storage:\n"
                        + f"{storage_path}\n"
                        + f"{storage_path_link}"
                    ),
                    index=idx,
                    mod=log_number_of_batches,
                )  # pylint: disable=C0301
                cleared_table = True

        log_mod(
            msg="STARTING UPLOAD TO GCS",
            index=idx,
            mod=log_number_of_batches,
        )
        if tb.table_exists(mode="staging"):
            # Upload them all at once
            tb.append(filepath=prepath, if_exists="replace")
            log_mod(
                msg="STEP UPLOAD: Successfully uploaded all batch files to Storage",
                index=idx,
                mod=log_number_of_batches,
            )
            for saved_file in saved_files:
                # Delete the files
                saved_file.unlink()
        else:
            # pylint: disable=C0301
            log_mod(
                msg="STEP UPLOAD: Table does not exist in STAGING, need to create first",
                index=idx,
                mod=log_number_of_batches,
            )

        # Get next batch.
        batch = database.fetch_batch(batch_size)
        idx += 1

    log(
        msg=f"Successfully dumped {idx} batches with size {len(batch)}, total of {idx*batch_size}",
    )
This task will dump and upload batches of data, sequentially.
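A minimal usage sketch, again assuming a Prefect 1.x flow context and placeholder names; in a real flow a query is executed on db first (see database_execute), and "sql_server" is a hypothetical DATABASE_MAPPING key:

from prefect import Flow

from pipelines.utils.dump_db.tasks import database_get, dump_upload_batch

with Flow("dump_upload_example") as flow:
    db = database_get(
        database_type="sql_server",  # hypothetical key in DATABASE_MAPPING
        hostname="db.example.com",
        port=1433,
        user="reader",
        password="***",
        database="mydb",
    )
    # dump_mode="append" keeps existing staging data;
    # dump_mode="overwrite" clears the staging table and storage first.
    dump_upload_batch(
        database=db,
        batch_size=50_000,
        dataset_id="example_dataset",
        table_id="example_table",
        dump_mode="overwrite",
        partition_columns=["updated_at"],
        batch_data_type="parquet",
    )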
def format_partitioned_query(query: str,
dataset_id: str,
table_id: str,
database_type: str,
partition_columns: List[str] = None,
lower_bound_date: str = None,
date_format: str = None,
wait=None)
@task(
    checkpoint=False,
    max_retries=constants.TASK_MAX_RETRIES.value,
    retry_delay=timedelta(seconds=constants.TASK_RETRY_DELAY.value),
)
def format_partitioned_query(
    query: str,
    dataset_id: str,
    table_id: str,
    database_type: str,
    partition_columns: List[str] = None,
    lower_bound_date: str = None,
    date_format: str = None,
    wait=None,  # pylint: disable=unused-argument
):
    """
    Formats a query for fetching partitioned data.
    """
    # If no partition column is specified, return the query as is.
    if not partition_columns or partition_columns[0] == "":
        log("NO partition column specified. Returning query as is")
        return query

    partition_column = partition_columns[0]

    # Check if the table already exists in BigQuery.
    table = bd.Table(dataset_id, table_id)

    # If it doesn't, return the query as is, so we can fetch the whole table.
    if not table.table_exists(mode="staging"):
        log("NO table was found. Returning query as is")
        return query

    blobs = get_storage_blobs(dataset_id, table_id)

    # extract only partitioned folders
    storage_partitions_dict = parser_blobs_to_partition_dict(blobs)
    # get last partition date
    last_partition_date = extract_last_partition_date(
        storage_partitions_dict, date_format
    )

    if lower_bound_date == "current_year":
        lower_bound_date = datetime.now().replace(month=1, day=1).strftime("%Y-%m-%d")
        log(f"Using lower_bound_date current_year: {lower_bound_date}")
    elif lower_bound_date == "current_month":
        lower_bound_date = datetime.now().replace(day=1).strftime("%Y-%m-%d")
        log(f"Using lower_bound_date current_month: {lower_bound_date}")
    elif lower_bound_date == "current_day":
        lower_bound_date = datetime.now().strftime("%Y-%m-%d")
        log(f"Using lower_bound_date current_day: {lower_bound_date}")

    if lower_bound_date:
        last_date = min(str(lower_bound_date), str(last_partition_date))
        log(f"Using lower_bound_date: {last_date}")
    else:
        last_date = str(last_partition_date)
        log(f"Using last_date from storage: {last_date}")

    # Using the last partition date, get the partitioned query.
    # `aux_name` must be unique and start with a letter, for better compatibility with
    # multiple DBMSs.
    aux_name = f"a{uuid4().hex}"[:8]

    log(
        f"Partition column DETECTED: {partition_column}, returning a NEW QUERY "
        "with partitioned columns and filters"
    )
    if database_type == "oracle":
        oracle_date_format = "YYYY-MM-DD" if date_format == "%Y-%m-%d" else date_format

        return f"""
        with {aux_name} as ({query})
        select * from {aux_name}
        where {partition_column} >= TO_DATE('{last_date}', '{oracle_date_format}')
        """

    return f"""
    with {aux_name} as ({query})
    select * from {aux_name}
    where {partition_column} >= '{last_date}'
    """
Formats a query for fetching partitioned data.
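For illustration, given a base query and a partition already present in storage, the task wraps the query in a CTE and appends a date filter. The snippet below only reproduces the shape of the non-Oracle output; the alias and last-partition date are illustrative:

query = "SELECT id, updated_at FROM source_table"
partition_column = "updated_at"
last_date = "2023-01-01"  # illustrative: the newest partition found in storage
aux_name = "a1b2c3d"      # illustrative: unique alias starting with a letter

partitioned_query = f"""
with {aux_name} as ({query})
select * from {aux_name}
where {partition_column} >= '{last_date}'
"""
print(partitioned_query)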
def parse_comma_separated_string_to_list(text: str) -> List[str]
@task
def parse_comma_separated_string_to_list(text: str) -> List[str]:
    """
    Parses a comma separated string to a list.

    Args:
        text: The text to parse.

    Returns:
        A list of strings.
    """
    if text is None or not text:
        return []
    # Remove extras.
    text = text.replace("\n", "")
    text = text.replace("\r", "")
    text = text.replace("\t", "")
    while ",," in text:
        text = text.replace(",,", ",")
    while text.endswith(","):
        text = text[:-1]
    result = [x.strip() for x in text.split(",")]
    result = [item for item in result if item != "" and item is not None]
    return result
Parses a comma separated string to a list.
Args
text
- The text to parse.
Returns
A list of strings.
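A quick illustration of the cleanup rules (newlines and tabs removed, duplicate commas collapsed, items trimmed, blanks dropped), assuming Prefect 1.x, where a task's .run() method calls the wrapped function directly:

from pipelines.utils.dump_db.tasks import parse_comma_separated_string_to_list

# .run() invokes the underlying function (Prefect 1.x FunctionTask behavior).
result = parse_comma_separated_string_to_list.run(" a, b ,, c,\n")
assert result == ["a", "b", "c"]
assert parse_comma_separated_string_to_list.run("") == []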