Module `pipelines.rj_cor.meteorologia.satelite.tasks`

Tasks for emd

Functions

def create_image_and_upload_to_api(info: dict, output_filepath: pathlib.Path)

Expand source code

@task
def create_image_and_upload_to_api(info: dict, output_filepath: Path):
    """
    Create image from dataframe and send it to API
    """

    dfr = pd.read_csv(output_filepath)

    dfr = dfr.sort_values(by=["latitude", "longitude"], ascending=[False, True])

    for var in info["variable"]:
        log(f"\nStart creating image for variable {var}\n")

        var = var.lower()
        data_array = get_variable_values(dfr, var)

        # Get the pixel values
        data = data_array.data[:]
        log(f"\n[DEBUG] data {data}")
        save_image_path = create_and_save_image(data, info, var)
        log(f"\nStart uploading image for variable {var} on API\n")
        # upload_image_to_api(info, save_image_path)
        log(save_image_path)
        log(f"\nEnd uploading image for variable {var} on API\n")

Create image from dataframe and send it to API

def download(product: str, date_hour_info: dict, band: str = None, ref_filename: str = None, redis_files: list = [], wait=None, mode_redis: str = 'prod') ‑> str | pathlib.Path

Expand source code

@task(nout=2, max_retries=10, retry_delay=dt.timedelta(seconds=60))
def download(
    product: str,
    date_hour_info: dict,
    band: str = None,
    ref_filename: str = None,
    redis_files: list = [],
    wait=None,
    mode_redis: str = "prod",
) -> Union[str, Path]:
    """
    Access S3 or GCP and download the first file on this specified date hour
    that is not already saved on redis
    """

    year = date_hour_info["year"]
    julian_day = date_hour_info["julian_day"]
    hour_utc = date_hour_info["hour_utc"][:2]
    partition_path = f"ABI-L2-{product}/{year}/{julian_day}/{hour_utc}/"
    log(f"Getting files from {partition_path}")

    storage_files_path, storage_origin, storage_conection = get_files_from_aws(
        partition_path
    )
    log(storage_files_path)
    if len(storage_files_path) == 0:
        storage_files_path, storage_origin, storage_conection = get_files_from_gcp(
            partition_path
        )

    # Keep only files from specified band
    if product == "CMIPF":
        # para capturar banda 13
        storage_files_path = [
            f for f in storage_files_path if bool(re.search("C" + band, f))
        ]

    # Skip task if there is no file on API
    if len(storage_files_path) == 0:
        log("No available files on API")
        skip = Skipped("No available files on API")
        raise ENDRUN(state=skip)

    base_path = os.path.join(os.getcwd(), "temp", "input", mode_redis, product[:-1])

    if not os.path.exists(base_path):
        os.makedirs(base_path)

    # Seleciona primeiro arquivo que não tem o nome salvo no redis
    log(f"\n\n[DEBUG]: available files on API: {storage_files_path}")
    log(f"\n\n[DEBUG]: filenames that are already saved on redis_files: {redis_files}")

    redis_files, destination_file_path, download_file = choose_file_to_download(
        storage_files_path, base_path, redis_files, ref_filename
    )

    # Skip task if there is no new file
    if download_file is None:
        log("No new available files")
        skip = Skipped("No new available files")
        raise ENDRUN(state=skip)

    # Download file from aws or gcp
    if storage_origin == "aws":
        storage_conection.get(download_file, destination_file_path)
    else:
        download_blob(
            bucket_name=storage_conection,
            source_blob_name=download_file,
            destination_file_name=destination_file_path,
            mode="prod",
        )

    return destination_file_path, redis_files

Access S3 or GCP and download the first file on this specified date hour that is not already saved on redis

def get_dates(current_time, product) ‑> str

Expand source code

@task()
def get_dates(current_time, product) -> str:
    """
    Task para obter o dia atual caso nenhuma data tenha sido passada
    Subtraimos 5 minutos da hora atual pois o último arquivo que sobre na aws
    sempre cai na hora seguinte (Exemplo: o arquivo
    OR_ABI-L2-RRQPEF-M6_G16_s20230010850208_e20230010859516_c20230010900065.nc
    cujo início da medição foi às 08:50 foi salvo na AWS às 09:00:33).
    """
    if current_time is None:
        current_time = pendulum.now("UTC").subtract(minutes=5).to_datetime_string()
    # Product sst is updating one hour later
    if product == "SSTF":
        current_time = pendulum.now("UTC").subtract(minutes=55).to_datetime_string()
    return current_time

Task para obter o dia atual caso nenhuma data tenha sido passada Subtraimos 5 minutos da hora atual pois o último arquivo que sobre na aws sempre cai na hora seguinte (Exemplo: o arquivo OR_ABI-L2-RRQPEF-M6_G16_s20230010850208_e20230010859516_c20230010900065.nc cujo início da medição foi às 08:50 foi salvo na AWS às 09:00:33).

def save_data(info: dict, mode_redis: str = 'prod') ‑> str | pathlib.Path

Expand source code

@task(nout=2)
def save_data(info: dict, mode_redis: str = "prod") -> Union[str, Path]:
    """
    Concat all netcdf data and save partitioned by date on a csv
    """

    log("Start saving product on a csv")
    output_path, output_filepath = save_data_in_file(
        product=info["product"],
        variable=info["variable"],
        datetime_save=info["datetime_save"],
        mode_redis=mode_redis,
    )
    return output_path, output_filepath

Concat all netcdf data and save partitioned by date on a csv

def slice_data(current_time: str, ref_filename: str = None) ‑> dict

Expand source code

@task(nout=1)
def slice_data(current_time: str, ref_filename: str = None) -> dict:
    """
    slice data to separate in year, julian_day, month, day and hour in UTC
    """
    if ref_filename is not None:
        year, julian_day, hour_utc = extract_julian_day_and_hour_from_filename(
            ref_filename
        )
        month = None
        day = None
    else:
        year = current_time[:4]
        month = current_time[5:7]
        day = current_time[8:10]
        hour_utc = current_time[11:13]
        julian_day = dt.datetime.strptime(current_time, "%Y-%m-%d %H:%M:%S").strftime(
            "%j"
        )

    date_hour_info = {
        "year": str(year),
        "julian_day": str(julian_day),
        "month": str(month),
        "day": str(day),
        "hour_utc": str(hour_utc),
    }

    return date_hour_info

slice data to separate in year, julian_day, month, day and hour in UTC

def tratar_dados(filename: str) ‑> dict

Expand source code

@task
def tratar_dados(filename: str) -> dict:
    """
    Convert X, Y coordinates from netcdf file to a latlon coordinates
    and select only the specified region on extent variable.
    """
    log(f"\n Started treating file: {filename}")
    # Create the basemap reference for the Rectangular Projection.
    # You may choose the region you want.

    # Full Disk Extent
    # extent = [-156.00, -81.30, 6.30, 81.30]

    # Brazil region
    # extent = [-90.0, -40.0, -20.0, 10.0]

    # Estado do RJ
    # lat_max, lon_max = (-20.69080839963545, -40.28483671464648)
    # lat_min, lon_min = (-23.801876626302175, -45.05290312102409)

    # Região da cidade do Rio de Janeiro
    # lat_max, lon_min = (-22.802842397418548, -43.81200531887697)
    # lat_min, lon_max = (-23.073487725280266, -43.11300020870994)

    # Recorte da região da cidade do Rio de Janeiro segundo meteorologista
    lat_max, lon_max = (
        -21.699774257353113,
        -42.35676996062447,
    )  # canto superior direito
    lat_min, lon_min = (
        -23.801876626302175,
        -45.05290312102409,
    )  # canto inferior esquerdo

    extent = [lon_min, lat_min, lon_max, lat_max]

    # Get informations from the nc file
    product_caracteristics = get_info(filename)
    product_caracteristics["extent"] = extent

    # Call the remap function to convert x, y to lon, lat and save converted file
    remap_g16(
        filename,
        extent,
        product=product_caracteristics["product"],
        variable=product_caracteristics["variable"],
    )

    return product_caracteristics

Convert X, Y coordinates from netcdf file to a latlon coordinates and select only the specified region on extent variable.