Module `pipelines.rj_cor.meteorologia.radar.precipitacao.src.utils.models_utils`

Functions

def get_predict_filepaths_and_log(args: dict, model_name: str) -> dict

Expand source code

def get_predict_filepaths_and_log(args: dict, model_name: str) -> dict:
    """Resolve the file paths of a prediction run and start its log entry.

    Every key of ``args`` that is not one of the standard arguments
    (``overwrite``, ``personal``, ``verbose``, ``input_test_set_file``,
    ``n_jobs``) is treated as a model parameter. The ``name=value`` pairs,
    joined by ``-``, name the sub-folder that holds the model artefacts.

    An existing prediction file is deleted when ``args["overwrite"]`` is
    truthy.

    Args:
        args: Run arguments. ``personal`` and ``input_test_set_file`` are
            always read; ``overwrite`` and ``verbose`` are read when a
            prediction file already exists.
        model_name: Name of the model folder under ``models/``.

    Returns:
        A dict with the keys ``output_log_filepath``,
        ``output_predict_filepath``, ``test_set_filepath``,
        ``input_model_filepath`` (``pathlib.Path`` objects) and
        ``log_entry``.

    Raises:
        SystemExit: With status 1, after the reason has been printed and
            appended to the log file, when the test set file or the model
            file is missing, or when the prediction file already exists and
            ``args["overwrite"]`` is falsy.
    """
    standard_arguments = {
        "overwrite",
        "personal",
        "verbose",
        "input_test_set_file",
        "n_jobs",
    }
    model_parameters = [key for key in args if key not in standard_arguments]
    submodel_folder_name = "-".join(
        f"{parameter}={args[parameter]}" for parameter in model_parameters
    )

    root_prefix = "personal/" if args["personal"] else ""
    dataset_name = (
        args["input_test_set_file"].replace("_test.jsonl", "").replace("_val.jsonl", "")
    )
    output_path = (
        root_prefix
        + f"radar_cal/{dataset_name}/models/{model_name}/{submodel_folder_name}"
    )
    output_log_filepath = pathlib.Path(output_path + "/predict_log.jsonl")
    output_predict_filepath = pathlib.Path(output_path + "/predict.npy")
    input_model_filepath = pathlib.Path(f"{output_path}/model.joblib")
    test_set_filepath = pathlib.Path(
        root_prefix + "data/processed/radar_cal/" + args["input_test_set_file"]
    )

    log_entry = initialize_log_entry(args)
    log_entry["input_files"] = {
        "test_set_file": str(test_set_filepath),
        "model_file": str(input_model_filepath),
    }
    log_entry["output_files"] = {
        "predict": str(output_predict_filepath),
        "log": str(output_log_filepath),
    }

    def abort(error_message: str):
        """Print and log ``error_message``, persist the log and terminate."""
        print_and_log(error_message, log_entry, print_error)
        # The model folder may not exist yet (e.g. no model was trained for
        # these parameters); create it so the failure can still be logged
        # instead of crashing with FileNotFoundError.
        output_log_filepath.parent.mkdir(parents=True, exist_ok=True)
        save_log(log_entry, output_log_filepath)
        # Non-zero status so that shells and schedulers see the failure; the
        # builtin ``exit`` is an interactive helper and reports success.
        raise SystemExit(1)

    if not test_set_filepath.is_file():
        abort("Error: Specified test set file does not exist. Terminating...")

    if not input_model_filepath.is_file():
        abort("Error: Specified input model file does not exist. Terminating...")

    if output_predict_filepath.is_file():
        if not args["overwrite"]:
            abort(
                f"Error: prediction file {output_predict_filepath} already exists. Pass -o to overwrite. Terminating..."
            )
        warning_message = (
            f"Warning: overwriting existing prediction file {output_predict_filepath}"
        )
        log_entry = print_and_log(
            warning_message, log_entry, print_warning, args["verbose"]
        )
        output_predict_filepath.unlink()

    return {
        "output_log_filepath": output_log_filepath,
        "output_predict_filepath": output_predict_filepath,
        "test_set_filepath": test_set_filepath,
        "input_model_filepath": input_model_filepath,
        "log_entry": log_entry,
    }

def get_train_filepaths_and_log(args: dict, model_name: str) -> dict

Expand source code

def get_train_filepaths_and_log(args: dict, model_name: str) -> dict:
    """Resolve the file paths of a training run and start its log entry.

    Every key of ``args`` that is not one of the standard arguments
    (``overwrite``, ``personal``, ``verbose``, ``input_file``, ``n_jobs``)
    is treated as a model parameter. The ``name=value`` pairs, joined by
    ``-``, name the sub-folder that receives the model artefacts.

    Side effects: the output folder is created if needed, and existing
    model/parameter files are deleted when ``args["overwrite"]`` is truthy.

    Args:
        args: Run arguments. ``personal`` and ``input_file`` are always
            read; ``overwrite`` and ``verbose`` are read when an output file
            already exists.
        model_name: Name of the model folder under ``models/``; also stored
            in the returned ``parameters_dict``.

    Returns:
        A dict with the keys ``output_log_filepath``,
        ``output_model_filepath``, ``output_parameters_filepath``,
        ``train_set_filepath`` (``pathlib.Path`` objects), ``log_entry`` and
        ``parameters_dict`` (the model parameters plus ``model_name``).

    Raises:
        SystemExit: With status 1, after the reason has been printed and
            appended to the log file, when the training set file is missing,
            or when the model or parameters file already exists and
            ``args["overwrite"]`` is falsy.
    """
    standard_arguments = {"overwrite", "personal", "verbose", "input_file", "n_jobs"}
    model_parameters = [key for key in args if key not in standard_arguments]
    submodel_folder_name = "-".join(
        f"{parameter}={args[parameter]}" for parameter in model_parameters
    )

    root_prefix = "personal/" if args["personal"] else ""
    dataset_name = args["input_file"].replace("_train.jsonl", "")
    output_path = (
        root_prefix
        + f"radar_cal/{dataset_name}/models/{model_name}/{submodel_folder_name}"
    )

    output_log_filepath = pathlib.Path(output_path + "/train_log.jsonl")
    pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

    train_set_filepath = pathlib.Path(
        root_prefix + "data/processed/radar_cal/" + args["input_file"]
    )

    log_entry = initialize_log_entry(args)
    log_entry["input_file"] = str(train_set_filepath)
    # The model/params entries stay empty until the input file is known to
    # exist, so a log written for a missing input does not list outputs.
    log_entry["output_files"] = {
        "model": "",
        "params": "",
        "log": str(output_log_filepath),
    }

    def abort(message: str, print_func):
        """Print and log ``message``, persist the log and terminate."""
        print_and_log(message, log_entry, print_func)
        save_log(log_entry, output_log_filepath)
        # Non-zero status so that shells and schedulers see the failure; the
        # builtin ``exit`` is an interactive helper and reports success.
        raise SystemExit(1)

    if not train_set_filepath.is_file():
        abort("Error: Specified input file does not exist. Terminating...", print_error)

    output_model_filepath = pathlib.Path(output_path + "/model.joblib")
    output_parameters_filepath = pathlib.Path(output_path + "/params.json")

    log_entry["output_files"]["model"] = str(output_model_filepath)
    log_entry["output_files"]["params"] = str(output_parameters_filepath)

    if output_model_filepath.is_file():
        if not args["overwrite"]:
            abort(
                f"Warning: model file {output_model_filepath} already exists. Pass -o to overwrite. Terminating...",
                print_warning,
            )
        warning_message = (
            f"Warning: overwriting existing model file {output_model_filepath}"
        )
        log_entry["shell_output"] += f"{warning_message}\n"
        print_warning(warning_message, verbose=args["verbose"])
        output_model_filepath.unlink()

    if output_parameters_filepath.is_file():
        if not args["overwrite"]:
            abort(
                f"Warning: parameters file {output_parameters_filepath} already exists. Pass -o to overwrite.",
                print_warning,
            )
        warning_message = f"Warning: overwriting existing parameters file {output_parameters_filepath}"
        log_entry["shell_output"] += f"{warning_message}\n"
        print_warning(warning_message, verbose=args["verbose"])
        output_parameters_filepath.unlink()

    parameters_dict = {key: args[key] for key in model_parameters}
    parameters_dict["model_name"] = model_name

    return {
        "output_log_filepath": output_log_filepath,
        "output_model_filepath": output_model_filepath,
        "output_parameters_filepath": output_parameters_filepath,
        "train_set_filepath": train_set_filepath,
        "log_entry": log_entry,
        "parameters_dict": parameters_dict,
    }

def initialize_log_entry(args: dict) -> dict

Expand source code

def initialize_log_entry(args: dict) -> dict:
    """Build the initial log record for a run.

    Args:
        args: Arguments of the run; stored as-is (same object, not a copy)
            under ``parameters_passed``.

    Returns:
        A dict with, in this order: ``parameters_passed``, ``datetime`` (the
        current naive local time followed by the literal suffix `` BRT``),
        ``user`` (the expanded ``~`` home directory path), ``git_hash`` (the
        hexsha of the HEAD object of the repository that GitPython locates
        with ``search_parent_directories=True``) and an empty
        ``shell_output`` string.
    """
    timestamp = f"{datetime.datetime.now()} BRT"
    home_directory = os.path.expanduser("~")
    head_commit_hash = git.Repo(search_parent_directories=True).head.object.hexsha

    return {
        "parameters_passed": args,
        "datetime": timestamp,
        "user": home_directory,
        "git_hash": head_commit_hash,
        "shell_output": "",
    }

def print_and_log(message: str, log_entry: dict, print_func, verbose: bool = True) -> dict

Expand source code

def print_and_log(
    message: str, log_entry: dict, print_func, verbose: bool = True
) -> dict:
    """Record ``message`` in the log entry and optionally print it.

    Args:
        message: Text to record; a newline is appended in the log.
        log_entry: Log record whose ``shell_output`` string is extended in
            place.
        print_func: Callable invoked with ``message`` when ``verbose`` is
            truthy.
        verbose: When falsy the message is only logged, not printed.

    Returns:
        The same ``log_entry`` object that was passed in.
    """
    logged_line = message + "\n"
    log_entry["shell_output"] += logged_line

    if not verbose:
        return log_entry

    print_func(message)
    return log_entry

def save_log(log_entry: dict, output_file)

Expand source code

def save_log(log_entry: dict, output_file):
    """Append ``log_entry`` to ``output_file`` as a single JSON line.

    Args:
        log_entry: JSON-serialisable log record.
        output_file: Path of the JSON-lines log file; opened in append mode,
            so it is created when missing. Its folder must already exist.

    Raises:
        TypeError: If ``log_entry`` holds a value ``json`` cannot serialise.
            In that case the file is left untouched.
    """
    # Serialise before opening the file: ``json.dump`` streams chunks, so a
    # non-serialisable value would leave a truncated line behind and corrupt
    # the JSON-lines log.
    serialized_entry = json.dumps(log_entry)
    with open(output_file, "a", encoding="utf-8") as outfile:
        outfile.write(serialized_entry + "\n")