From 38c9e926be38d953fcda50de078cde603ed92469 Mon Sep 17 00:00:00 2001 From: f-PLT Date: Fri, 1 Nov 2024 17:16:11 -0400 Subject: [PATCH 01/10] Add `get_json_config` function --- climateset/utils.py | 46 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/climateset/utils.py b/climateset/utils.py index d93464b..a325ce9 100644 --- a/climateset/utils.py +++ b/climateset/utils.py @@ -1,3 +1,4 @@ +import json import logging import pathlib import sys @@ -85,7 +86,7 @@ def get_yaml_config(yaml_config_file: Union[str, pathlib.Path], logger: logging. if isinstance(yaml_config_file, str): yaml_config_file = pathlib.Path(yaml_config_file) potential_paths = [ - pathlib.Path(yaml_config_file), + yaml_config_file, CONFIGS / yaml_config_file, CONFIGS / f"{yaml_config_file}.yaml", CONFIGS / f"{yaml_config_file}.yml", @@ -110,3 +111,46 @@ def get_yaml_config(yaml_config_file: Union[str, pathlib.Path], logger: logging. except yaml.YAMLError as e: logger.warning(f"Error loading YAML file [{config_filepath}]: {e}") return {} + + +def get_json_config(json_config_file: Union[str, pathlib.Path], logger=LOGGER) -> dict: + """ + This function takes in the path, or name of the file if it can be found in the config/ folder, with of without the + extension, and returns the values of the file in a dictionary format. + + Ex. For a file named app_config.json, directly in the config/ folder, + the function could be called like so : `params = get_json_config('app_config')` + + Args: + json_config_file: Path to JSON config file. If config file is in the config folder, + logger: Logger to handle messaging, by default LOGGER + + Returns: + Dictionary of JSON configuration values + """ + if isinstance(json_config_file, str): + json_config_file = pathlib.Path(json_config_file) + potential_paths = [ + json_config_file, + CONFIGS / json_config_file, + CONFIGS / f"{json_config_file}.json", + ] + + config_filepath = None + for path in potential_paths: + if path.exists(): + config_filepath = path + logger.info(f"JSON config file [{str(path)}] found.") + break + + if not config_filepath: + logger.error(f"JSON config file [{json_config_file}] not found.") + return {} + + try: + with config_filepath.open("r", encoding="UTF-8") as file: + logger.info(f"Loading JSON config file [{config_filepath}].") + return json.load(file) + except json.JSONDecodeError as e: + logger.warning(f"Error loading JSON file [{config_filepath}]: {e}") + return {} From f1fed4937865b7b5d7780b8c3561aa990e83c858 Mon Sep 17 00:00:00 2001 From: f-PLT Date: Fri, 1 Nov 2024 17:23:20 -0400 Subject: [PATCH 02/10] First import of processing and start of refactor --- climateset/processing/__init__.py | 0 climateset/processing/raw/__init__.py | 0 .../processing/raw/abstract_raw_processing.py | 104 ++ climateset/processing/raw/checker.py | 389 ++++++ climateset/processing/raw/cmip6_processing.py | 232 ++++ .../processing/raw/input4mips_processing.py | 1173 +++++++++++++++++ climateset/processing/raw/utils.py | 18 + climateset/processing/resolution/__init__.py | 0 configs/processing/raw_processing_params.json | 52 + tests/test_processing/__init__.py | 0 tests/test_processing/test_raw_processing.py | 61 + 11 files changed, 2029 insertions(+) create mode 100644 climateset/processing/__init__.py create mode 100644 climateset/processing/raw/__init__.py create mode 100644 climateset/processing/raw/abstract_raw_processing.py create mode 100644 climateset/processing/raw/checker.py create mode 100644 
climateset/processing/raw/cmip6_processing.py create mode 100644 climateset/processing/raw/input4mips_processing.py create mode 100644 climateset/processing/raw/utils.py create mode 100644 climateset/processing/resolution/__init__.py create mode 100644 configs/processing/raw_processing_params.json create mode 100644 tests/test_processing/__init__.py create mode 100644 tests/test_processing/test_raw_processing.py diff --git a/climateset/processing/__init__.py b/climateset/processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/climateset/processing/raw/__init__.py b/climateset/processing/raw/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/climateset/processing/raw/abstract_raw_processing.py b/climateset/processing/raw/abstract_raw_processing.py new file mode 100644 index 0000000..e94b535 --- /dev/null +++ b/climateset/processing/raw/abstract_raw_processing.py @@ -0,0 +1,104 @@ +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Union + +from climateset import CONFIGS +from climateset.processing.raw.checker import ( + AbstractDirectoryChecker, + BasicDirectoryChecker, +) +from climateset.utils import create_logger, get_json_config + +LOGGER = create_logger(__name__) + + +class AbstractRawProcesser(ABC): + """Abstract class for raw processing.""" + + def __init__( + self, + input_directory: Union[str, Path], + working_directory: Union[str, Path], + processing_steps: list = None, + checker: AbstractDirectoryChecker = None, + processing_parameters_config: Union[str, Path] = CONFIGS / "processing" / "raw_processing_params.json", + ): + """Init + Args: + + """ + # type of class + self.type_class = self.type_class_meta() + self.input_directory = input_directory + if isinstance(input_directory, str): + self.input_directory = Path(input_directory) + self.working_directory = working_directory + if isinstance(working_directory, str): + self.working_directory = Path(working_directory) + self.processing_steps = processing_steps + if not self.processing_steps: + self.processing_steps = [] + self.available_steps = {} + self.processing_params_config: Union[str, Path] = processing_parameters_config + self.meta_raw_dict = get_json_config(self.processing_params_config) + self.calendar = self.meta_raw_dict["calendar"] + self.desired_units = self.meta_raw_dict["units"] + + self.checker = checker + if not self.checker: + self.checker: AbstractDirectoryChecker = BasicDirectoryChecker(directory=self.input_directory) + + @abstractmethod + def type_class_meta(self) -> str: + """Returns the name tag of the subclass.""" + + @abstractmethod + def preprocess_subdir( + self, + input_dir: Path, + cleaned_dir: Path, + processed_dir: Path, + load_dir: Path, + overwrite: bool, + silent: bool, + sum_sec_input_res: str, + sum_sec_input_freq: str, + ): + """ + Preprocessing a subdir - must be implemented for each subclass. + Args: + input_dir: + cleaned_dir: + processed_dir: + load_dir: + overwrite: + silent: + sum_sec_input_res: + sum_sec_input_freq: + + Returns: + + """ + + @abstractmethod + def file_belongs_to_type(self, input_file: Path) -> bool: + """ + Checks if a file belongs to input4mips or cmip6 category. 
+ + Args: + input_file (Path): the file that should be checked + Returns: + True if it belong to the class type, False if not + """ + + def add_processing_step(self, steps: Union[str, list]): + if isinstance(steps, str): + steps = [steps] + for step in steps: + if step in self.available_steps: + self.processing_steps.append(step) + + def list_available_steps(self): + step_list = [step for step in self.available_steps.keys() if not step.startswith("_")] + LOGGER.info(f"Available steps: {step_list}") + return step_list diff --git a/climateset/processing/raw/checker.py b/climateset/processing/raw/checker.py new file mode 100644 index 0000000..037bd8c --- /dev/null +++ b/climateset/processing/raw/checker.py @@ -0,0 +1,389 @@ +import gc +import json +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Optional, Union + +import numpy as np +import xarray as xr + +from climateset import CONFIGS +from climateset.utils import create_logger, get_json_config + +LOGGER = create_logger(__name__) + +VERIFY_FILE = "verify_file_ok" +VERIFY_UNITS = "verify_units" +VERIFY_RESOLUTION = "verify_resolution" +VERIFY_LONLAT = "verify_lonlat" +VERIFY_VARIABLES = "verify_variables" + +AVAILABLE_RAW_CHECKS = frozenset([VERIFY_UNITS, VERIFY_RESOLUTION, VERIFY_LONLAT, VERIFY_VARIABLES]) + + +class AbstractDirectoryChecker(ABC): + def __init__(self, directory: Union[str, Path], checker_steps: list = None): + if isinstance(directory, str): + directory = Path(directory) + self.directory: Union[str, Path] = directory + self.checker_steps: list = checker_steps + if not self.checker_steps: + self.checker_steps = [] + + self.results = [] + self.errors = [] + + @abstractmethod + def check_directory(self): + raise NotImplementedError + + def write_results(self, base_path: Path): + if self.results: + filename = base_path / "results.json" + with open(filename, "w", encoding="utf-8") as json_file: + json.dump(self.results, json_file, indent=2) + if self.errors: + filename = base_path / "errors.json" + with open(filename, "w", encoding="utf-8") as json_file: + json.dump(self.errors, json_file, indent=2) + + +class AbstractFileChecker(ABC): + def __init__(self, input_file: Union[str, Path], checker_steps: list = None, available_checks: dict = None): + self.input_file: Union[str, Path] = input_file + self.checker_steps: list = checker_steps + if not self.checker_steps: + self.checker_steps = [] + self.available_steps = available_checks + self.results = {} + self.dataset: Optional[xr.Dataset] = None + + def check(self): + self.check_file_ok() + for step in self.checker_steps: + self.available_steps[step]() + return {"filename": self.input_file.name, "results": self.results} + + def check_file_ok(self): + try: + LOGGER.info(f"Trying to load file {self.input_file}") + ds = xr.load_dataset(self.input_file) + self.dataset = ds + LOGGER.info(f"Successfully loaded file {self.input_file}") + self.results["file_ok"] = True + + except ValueError: + self.results["file_ok"] = False + LOGGER.warning(f"The following file is corrupt:\n {self.input_file}") + self.dataset = None + + def cleanup_memory(self): + del self.dataset + gc.collect() + + +class BasicDirectoryChecker(AbstractDirectoryChecker): + def __init__( + self, + directory: Union[str, Path] = None, + checker_steps: list = None, + ): + super().__init__(directory=directory, checker_steps=checker_steps) + + def check_directory(self) -> bool: + """ + Checking all files in a sub dir. + + TAKES TIME. 
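+
+        Example (illustrative sketch; the directories used here are placeholders):
+            checker = BasicDirectoryChecker(directory=Path("data/raw"))
+            all_files_ok = checker.check_directory()
+            checker.write_results(base_path=Path("logs"))
+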
+ Args: + + Returns: + bool: True if all checks were successful, False if not + """ + for f in self.directory.glob("*.nc"): + file_checker = BasicFileChecker(input_file=f, checker_steps=self.checker_steps) + file_checker.check() + if not all(file_checker.results.values()): + self.errors.append(file_checker.results) + self.results.append(file_checker.results) + file_checker.cleanup_memory() + + if self.errors: + LOGGER.warning("There were errors running while running the checks") + LOGGER.warning(f"Errors : {self.errors}") + return False + + LOGGER.info("All checks passed") + return True + + +class BasicFileChecker(AbstractFileChecker): + def __init__( + self, + input_file, + checker_steps: list = None, + ): + super().__init__( + input_file=input_file, checker_steps=checker_steps, available_checks={VERIFY_FILE: self.check_file_ok} + ) + + def check(self): + """ + Logs warnings if there are inconsistent files. + + Checks always + for corruptness, the rest is optional. See initialization of class. + Args: + input_file (Path): The path that should be checked + log_file (Path): Where error / warnings should be logged additionally. + mode (str): 'w' writes the logs into the file, 'a' appends it to + an existing file. + Returns: + True if all checks were passed, False if not. + """ + + self.check_file_ok() + + +class RawDirectoryChecker(AbstractDirectoryChecker): + def __init__( + self, + directory: Union[str, Path] = None, + checker_steps: list = None, + type_class=None, + lonlat_resolution_to_verify: str = "50_km", + processing_params_config: Union[str, Path] = CONFIGS / "processing" / "raw_processing_params.json", + ): + super().__init__(directory=directory, checker_steps=checker_steps) + self.type_class = type_class + self.lonlat_resolution_to_verify = lonlat_resolution_to_verify + self.meta_raw_dict = get_json_config(processing_params_config) + self.calendar = self.meta_raw_dict["calendar"] + self.desired_units = self.meta_raw_dict["units"] + + lons = self.meta_raw_dict[self.lonlat_resolution_to_verify][f"{self.type_class}_lon"] + lats = self.meta_raw_dict[self.lonlat_resolution_to_verify][f"{self.type_class}_lat"] + self.lon_specs = (float(lons["min"]), float(lons["max"]), float(lons["step"])) # (-179.75, 179.75, 0.5) + self.lat_specs = (float(lats["min"]), float(lats["max"]), float(lats["step"])) # (-89.75, 89.75, 0.5) + + def add_unit_check(self): + self.checker_steps.append(VERIFY_UNITS) + + def add_resolution_check(self): + self.checker_steps.append(VERIFY_RESOLUTION) + + def add_lonlat_check(self): + self.checker_steps.append(VERIFY_LONLAT) + + def add_variables_check(self): + self.checker_steps.append(VERIFY_VARIABLES) + + def check_directory(self, log_file: Path = None, model: str = "") -> bool: + """ + Checking all files in a sub dir. + + TAKES TIME. + Args: + sub_dir (Path): The dir that should be checked + log_file (Path): Where problematic files should be logged + mode (str): 'w' writes the logs into the file, 'a' appends it to + an existing file. + model (str): Default is "". Set this, if you want that only + the data of this model is checked. 
+ Returns: + bool: True if all checks were successful, False if not + """ + + for f in self.directory.rglob("*.nc"): + file_checker = RawFileChecker( + input_file=f, + type_class=self.type_class, + checker_steps=self.checker_steps, + lonlat_resolution_to_verify=None, + calendar=self.calendar, + units=self.desired_units, + lon_specs=self.lon_specs, + lat_specs=self.lat_specs, + expected_units=self.meta_raw_dict["units"], + ) + file_checker.check() + if not all(file_checker.results.values()): + self.errors.append(file_checker.results) + self.results.append(file_checker.results) + file_checker.cleanup_memory() + + if self.errors: + LOGGER.warning("There were errors running while running the checks") + LOGGER.warning(f"Errors : {self.errors}") + return False + + LOGGER.info("All checks passed") + return True + + +class RawFileChecker(AbstractFileChecker): + def __init__( + self, + input_file, + type_class=None, + checker_steps: list = None, + lonlat_resolution_to_verify=None, + calendar=None, + units=None, + lon_specs=None, + lat_specs=None, + expected_units=None, + ): + super().__init__( + input_file=input_file, + checker_steps=checker_steps, + available_checks={ + VERIFY_UNITS: self.check_units_ok, + VERIFY_RESOLUTION: self.check_resolution, + VERIFY_LONLAT: self.check_lonlat, + VERIFY_VARIABLES: self.check_variables, + }, + ) + self.type_class = type_class + self.lonlat_resolution_to_verify = lonlat_resolution_to_verify + self.calendar = calendar + self.desired_units = units + self.lon_specs = lon_specs + self.lat_specs = lat_specs + self.results = {} + self.expected_units = expected_units + + def check(self): + """ + Logs warnings if there are inconsistent files. + + Checks always + for corruptness, the rest is optional. See initialization of class. + Args: + input_file (Path): The path that should be checked + log_file (Path): Where error / warnings should be logged additionally. + mode (str): 'w' writes the logs into the file, 'a' appends it to + an existing file. + Returns: + True if all checks were passed, False if not. 
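+
+        Example -- checking a single raw file (illustrative; the file name and
+        expected units below are placeholders):
+            checker = RawFileChecker(
+                input_file=Path("CO2_em_anthro_sample.nc"),
+                type_class="input4mips",
+                checker_steps=[VERIFY_UNITS, VERIFY_VARIABLES],
+                expected_units={"CO2_em_anthro": "kg m-2 s-1"},
+            )
+            checker.check()
+            print(checker.results)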
+ """ + + self.check_file_ok() + for step in self.checker_steps: + self.available_steps[step]() + + def check_variables(self): + try: + actual_var = self.dataset.variable_id + # cmip6 case + if self.type_class == "cmip6": + named_var = str(self.input_file.name).split("_")[4] + # input4mips case + else: + named_var = str(self.input_file.parents[0]).split("/")[-4] + if "sum" in named_var: + named_var = named_var.split("_")[0] + + if actual_var != named_var: + self.results["var_ok"] = False + LOGGER.warning( + "The following file contains the variable {}, but the variable {} was expected: \n{}".format( + actual_var, named_var, self.input_file + ) + ) + else: + self.results["var_ok"] = True + except AttributeError: + self.results["var_ok"] = False + LOGGER.warning(f"Dataset does not contain variable_id: \n {self.input_file}") + + def check_lonlat(self, ds, check_dict, input_file, passed_checks): + lon_min, lon_max, lon_step = self.lon_specs # (-179.75, 179.75, 0.5) + lat_min, lat_max, lat_step = self.lat_specs # (-89.75, 89.75, 0.5) + try: + found_lon_min, found_lon_max = ds.lon[0].item(), ds.lon[-1].item() + found_lat_min, found_lat_max = ds.lat[0].item(), ds.lat[-1].item() + found_lon_step = abs(ds.lon[0].item() - ds.lon[1].item()) + found_lat_step = abs(ds.lat[0].item() - ds.lat[1].item()) + + # longitude checks + if (lon_min == found_lon_min) and (lon_max == found_lon_max) and (np.isclose(lon_step, found_lon_step)): + check_dict["lon_ok"] = True + else: + passed_checks = False + check_dict["lon_ok"] = False + LOGGER.warning(f"The following file has unexpected lon values: \n{input_file}") + + # latitude checks + if (lat_min == found_lat_min) and (lat_max == found_lat_max) and (np.isclose(lat_step, found_lat_step)): + check_dict["lat_ok"] = True + else: + passed_checks = False + check_dict["lat_ok"] = False + LOGGER.warning(f"The following file has unexpected lat values: \n{input_file}") + + except AttributeError: + passed_checks = False + check_dict["lon_ok"] = False + check_dict["lat_ok"] = False + LOGGER.warning(f"Dataset contains different namings for longitude / latitude. 
\n {input_file}") + return passed_checks + + def check_resolution(self): + try: + actual_spat_res = self.dataset.attrs["nominal_resolution"] + actual_spat_res = actual_spat_res.replace("_", " ") # just in case + named_spat_res = " ".join(str(self.input_file.name).split("_")[-5:-3]) + if actual_spat_res != named_spat_res: + self.results["res_ok"] = False + LOGGER.warning( + "The following file has the nominal resolution {}, but the resolution {} was expected: \n{}".format( + actual_spat_res, named_spat_res, self.input_file + ) + ) + else: + self.results["res_ok"] = True + + actual_temp_res = self.dataset.attrs["frequency"] + named_temp_res = str(self.input_file.name).split("_")[-3] + if actual_temp_res != named_temp_res: + self.dataset["freq_ok"] = False + LOGGER.warning( + "The following file has the temporal frequency {}, but the frequency {} was expected: \n{}".format( + actual_temp_res, named_temp_res, self.input_file + ) + ) + else: + self.dataset["freq_ok"] = True + except AttributeError: + self.dataset["res_ok"] = False + self.dataset["freq_ok"] = False + LOGGER.warning(f"The following file gives no access to nominal resolution or frequency: {self.input_file}") + + def check_units_ok(self): + try: + var = self.dataset.variable_id + found_unit = None + + try: + found_unit = self.dataset[var].units + except AttributeError: + try: + found_unit = self.dataset["variable_units"] + except AttributeError: + raise KeyError(f"Unit could not be found in file {self.input_file}") + + expected_unit = self.expected_units[var] + if found_unit != expected_unit: + self.results["units_ok"] = False + LOGGER.warning( + "The following file has the unit {}, but the unit {} was expected: \n{}".format( + found_unit, expected_unit, self.input_file + ) + ) + else: + self.results["units_ok"] = True + + except AttributeError: + self.results["units_ok"] = False + LOGGER.warning(f"The following file does not allow us to call ds.variable_id:\n{self.input_file}") diff --git a/climateset/processing/raw/cmip6_processing.py b/climateset/processing/raw/cmip6_processing.py new file mode 100644 index 0000000..61a2711 --- /dev/null +++ b/climateset/processing/raw/cmip6_processing.py @@ -0,0 +1,232 @@ +import os +import re +import subprocess +import warnings +from pathlib import Path + +import xarray as xr +import xmip +import xmip.preprocessing as xmip_preprocessing +from tqdm import tqdm + +from climateset.processing.raw.abstract_raw_processing import AbstractRawProcesser +from climateset.processing.raw.utils import create_generic_output_path + + +# Attention: we need to write _desired_units for our all our data +# Attention: Before you apply, make sure this is only applied to CMIP6 data! +class ClimateModelProcesser(AbstractRawProcesser): + """ + Can be called to apply xmip preprocessing. + + Works only on Cmip6 Data! 
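+
+    Intended usage (illustrative sketch; the directories are placeholders and the
+    constructor arguments may still change as the refactor proceeds):
+        processer = ClimateModelProcesser(sum_levels=True, correct_calendar=True)
+        processer.preprocess_subdir(
+            sub_dir=Path("data/raw/CMIP6"),
+            output_dir=Path("data/processed/CMIP6"),
+        )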
+ """ + + def __init__( + self, + rename_ds: bool = True, + correct_lonlat: bool = True, + correct_units: bool = True, + fix_bounds: bool = True, + verify_units: bool = False, + verify_resolution: bool = False, + verify_lonlat: bool = False, + verify_num_of_files: bool = False, + verify_variables: bool = False, + correct_time_axis: bool = True, + correct_calendar: bool = True, + sum_levels: bool = True, + **kwargs, + ): + """Init function for xmip processer + Args: + rename_ds (bool): Make naming in cmip6 consistent + correct_lonlat (bool): Make lonlat naming consistent + correct_units (bool): Make units consistent + fix_bounds (bool): Fix bounds and vertex + verify_units (bool): Makes sure that all ghg conform to the same units + verify_resolution (bool): Checks if the res in the file name is the same like in the file + verify_lonlat (bool): Checks if the longitude, latitude values are as expected + verify_num_of_files (bool): Checks how many files are in the final dir. + raises a warning if more or less than one. + verify_variables (bool): Checks if the variables are also the expected vars. + correct_calendar (bool): Makes sure that the right calendar is used. + correct_time_axis (bool): Makes sure that each file starts at the first + of each time-unit and shifts if not. + sum_levels (bool): Makes sure that all emissions are sumed over the + different levels. + """ + # init abstract class with checks + super().__init__( + verify_units=verify_units, + verify_resolution=verify_resolution, + verify_lonlat=verify_lonlat, + verify_num_of_files=verify_num_of_files, + verify_variables=verify_variables, + ) + + # xmip processing steps + self.rename_ds = rename_ds + self.correct_lonlat = correct_lonlat + self.correct_units = correct_units + self.fix_bounds = fix_bounds + + # climatevers processing steps + self.correct_calendar = correct_calendar + self.correct_time_axis = correct_time_axis + self.sum_levels = sum_levels + + # update the desired unit dicts for unit correction of xmip + for var, unit in self.desired_units.items(): + self.update_desired_units_dict(var, unit) + + def type_class_meta(self) -> str: + """Returns the name tag of the subclass.""" + return "cmip6" + + # TODO share this in process class with input4mips / cmip6 + def file_belongs_to_type(self, input_file: Path) -> bool: + """ + Check if the name cmip6 appears in the path name. + + Args: + input_file (Path): the file that should be checked for cmip6 + Returns: + True if name indicates it's a cmip6 file, False if not. + """ + return bool(re.search("cmip6", str(input_file), re.IGNORECASE)) + + # Is used internally and can be updated externally + def update_desired_units_dict(self, var: str, unit: str): + """ + Must be applied before running xmip processing. + + Updates the global + dict of xmip to include which units you want to use throughout all models. + Args: + var (str): the variable that shall be added as key + unit (str): the desired unit that all models should use as value + """ + # adapt global variable from xmip + xmip.preprocessing._desired_units[var] = unit + + # share this with input4mips? + def preprocess_subdir( + self, + sub_dir: Path, + output_dir: Path, + xmip: bool = True, + climateverse: bool = True, + model: str = "", + overwrite: bool = False, + threads: int = 1, + ): + """ + Applies the preprocessing to a complete directory. 
+ + Args: + sub_dir (Path): directory on which xmip processing should be applied + output_dir (Path): where the preprocessed files should be stored + xmip (bool): if xmip processing should be applied + climateverse (bool): if additional processing from us should be applied + model (str): Default is "", i.e. all models are processed the + same, all at once. Set this to a specific model if you want instead. + overwrite (bool): If data can be overwritten or not + """ + print(f"Start Cmip6 preprocessing of {sub_dir}.") + + # loop through sub_dir to remap all files in here + # total_files = len(list(sub_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(sub_dir)): + if len(files) > 0: + for file in files: + # create output dir + input_file = Path(path) / file + + if self.file_belongs_to_type(input_file) and (model in str(input_file)): + # create output file + if sub_dir == output_dir: + output_file = input_file + else: + output_file = create_generic_output_path(output_dir, path, file) + + # process + if (not output_file.is_file()) or overwrite: + if climateverse and xmip: + self.climateverse_process(input_file, output_file, threads=threads) + self.xmip_process(output_file, output_file) + elif climateverse: + self.climateverse_process(input_file, output_file, threads=threads) + elif xmip: + self.xmip_process(input_file, output_file) + else: + warnings.warn("No processing requested.") + + print(f"Finished Cmip6 preprocessing of {sub_dir} and saved it at {output_dir}.") + + def xmip_process(self, input_file: Path, output_file: Path): + """ + Applies the xmip processing to a specific file. + + Make sure this is + a CMIP6 file beforehand. + Args: + input_file (Path): Input netcdf cmip6 file that shall be processed. + output_file (Path): Where the processed dataset shall be stored. + """ + # load file as dataset + ds = xr.load_dataset(input_file) + + # call preprocessing + if self.rename_ds: + ds = xmip_preprocessing.rename_cmip6(ds) + if self.correct_lonlat: + ds = xmip_preprocessing.promote_empty_dims(ds) + ds = xmip_preprocessing.correct_coordinates(ds) + ds = xmip_preprocessing.broadcast_lonlat(ds) + ds = xmip_preprocessing.correct_lon(ds) + if self.correct_units: + ds = xmip_preprocessing.correct_units(ds) + if self.fix_bounds: + ds = xmip_preprocessing.parse_lon_lat_bounds(ds) + ds = xmip_preprocessing.sort_vertex_order(ds) + ds = xmip_preprocessing.maybe_convert_bounds_to_vertex(ds) + ds = xmip_preprocessing.maybe_convert_vertex_to_bounds(ds) + ds = xmip_preprocessing.fix_metadata(ds) + ds = ds.drop_vars(xmip_preprocessing._drop_coords, errors="ignore") + + # store at new location (not raw anymore) + ds.to_netcdf(output_file, mode="w", format="NETCDF4", engine="netcdf4") + + def climateverse_process(self, input_file: Path, output_file: Path, threads: int = 1): + """ + Applies additional processing steps that are not included in xmip. Needs to be applied BEFORE xmip processing + because it's using cdo and xmip breaks cdo formatting (at least for me so far). + + Args: + input_file (Path): The file that should be processed. + output_file (Path): Where the resulting file is stored. + threads (int): threads for cdo + """ + chained_operators = [] + + # cdo vertsum infile outfile + if self.sum_levels: + chained_operators.append("-vertsum") + + # ATTENTION only relevant for "monthly" data! 
+ # cdo -setday,1 -settime,00:00:00 + if self.correct_time_axis: + if "mon" in str(input_file): + chained_operators.append("-setday,1") + chained_operators.append("-settime,00:00:00") + else: + warnings.warn("Skipping correcting time axis, since it should only be applied to monthly data.") + + if self.correct_calendar: + # default calendar: 365_day + chained_operators.append(f"-setcalendar,{self.calendar}") + + # run commands + commands = ["cdo", "-s", "-w", "-L", "-P", str(threads), *chained_operators, input_file, output_file] + subprocess.call(commands) diff --git a/climateset/processing/raw/input4mips_processing.py b/climateset/processing/raw/input4mips_processing.py new file mode 100644 index 0000000..9a1159f --- /dev/null +++ b/climateset/processing/raw/input4mips_processing.py @@ -0,0 +1,1173 @@ +import os +import re +import subprocess +import warnings +from copy import copy +from pathlib import Path +from typing import Union + +import pint +import xarray as xr +from tqdm import tqdm + +from climateset.processing.raw.abstract_raw_processing import AbstractRawProcesser +from climateset.processing.raw.checker import AbstractDirectoryChecker +from climateset.processing.raw.utils import create_generic_output_path +from climateset.utils import create_logger + +# +# Processing steps +# + +# private step +_CDO_PROCESSING = "_cdo_processing" + +# user available steps +CORRECT_NAMES = "correct_names" +FIX_CO2 = "fix_co2" +CREATE_FIRE_FILES = "create_fire_files" +CORRECT_UNITS = "correct_units" +CORRECT_CALENDAR = "correct_calendar" +CORRECT_TIME_AXIS = "correct_time_axis" +SUM_LEVELS = "sum_levels" +SUM_SECTORS = "sum_sectors" +CREATE_TOTALS = "create_totals" +MERGE_GHG = "merge_ghg" + +AVAILABLE_INPUT4MIPS_PROCESSING_STEPS = frozenset( + [ + CORRECT_NAMES, + FIX_CO2, + CREATE_FIRE_FILES, + CORRECT_UNITS, + CORRECT_CALENDAR, + CORRECT_TIME_AXIS, + SUM_LEVELS, + SUM_SECTORS, + CREATE_TOTALS, + MERGE_GHG, + ] +) + +SET_DAY = "-setday,1" +SET_TIME = "-settime,00:00:00" + +LOGGER = create_logger(__name__) + + +class Input4MipsEmissionProcesser(AbstractRawProcesser): + """ + Can be called to summarize emission over sectors. + + Only applied to Input4MIPs data. + """ + + def __init__( + self, + input_directory: Union[str, Path], + working_directory: Union[str, Path], + checker: AbstractDirectoryChecker = None, + processing_steps=None, + spat_load: str = "map", + temp_load: str = "mon", + ghg_list: list = None, + meta_data: Path = None, + fire_cases: bool = True, + force_sum_sectors: list = None, + input_freq: str = "mon", + input_res: str = "50_km", + ): + """Init emission / ghg processer + Args: + verify_units (bool): Makes sure that all ghg conform to the same units. + Default false because slow. + verify_resolution (bool): Checks if the res in the file name is the same like in the file. + Default false because slow. + verify_lonlat (bool): Checks if the longitude, latitude values are as expected. + Default false because slow. + verify_num_of_files (bool): Checks how many files are in the final dir. + raises a warning if more or less than one. + verify_variables (bool): Checks if the variables are also the expected vars. + fix_co2 (bool): Fixes co2 data if necessary. Needed for ssp scenarios! + Default false, because we are providing raw data where it's already fixed. + correct_names (bool): renames all files that are expected + to be biomassburning files. Attention! apply only to raw data, + will mess things up if applied later! + correct_units (bool): Default False, because it takes long. 
Corrects + the units according to meta data saved in the raw param file. + correct_calendar (bool): Makes sure that the right calendar is used. + correct_time_axis (bool): Makes sure that each file starts at the first + of each time-unit and shifts if not. + sum_levels (bool): Makes sure that all emissions are sumed over the + different levels and sectors. For AIR files, the emissions are + summed over 25 different height levels. For ANTHRO files, the + emissions are summed over 8 different sectors. Openburning and + biomassburning have only one level (sectors are diff in external + percentage files, see 'fire_cases'). + sum_sectors (bool): Makes sure that ghg are summarized over diff sectors. + Historical data is treated differently. Different versions + are stored in the output_dir for historical data, since different + climate models need different versions here. There is a param/config file + mapping the right climate models to the right sum_sector types. + create_totals (bool): For some tasks you might not want to use + emission files, but arrays of totals over a specific time frame. + With this you can creat either map or total data; monthly or yearly. + See spat_load and temp_load for that. + spat_load (str): Can be "map" or "tot". Map is a long / lat map. + Total is an array with the totals summed over the spatial axis. + temp_load (str): Can be "mon" or "year". Decides if the loaded data + is monthly or yearly averaged. Can be extended later on. + merge_ghg (list or bool): If True: + Simply merges all ghg that can be found in the sub_dir (e.g.). + Instead one can hand over a list, which means that those ghg + specifically will be merged into a new file. + ghg_list (list): The greenhouse gases (ghg) that should be merged in + merge_ghg. Should be the folder names used in the respective dir. + meta_data (Path): Default None. Only necessary when sum_sectors is + used and anthro fire files should be generated. + fire_cases (bool): If the different fire cases should be considered + during sum_sectors. + force_sum_sectors (list): Which GHG are forced to be summed (sum_sectors) even if not all + sectors (AIR, anthro, biomassburning/openburning) is available. + Default is ["CO2"], since CO2 is the only GHG that does not have + biomassburning/openburning files as the rest does. + input_freq (str): Default "mon". Only used for create_totals_subdir. + input_res (str): Default "50 km". Only used for create_totals_subdir. + create_fire_files (bool): Default false, because it only needs to be + created one time. 
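+
+        Example -- building a processer with a few steps enabled (illustrative;
+        the directories are placeholders):
+            processer = Input4MipsEmissionProcesser(
+                input_directory="data/raw/input4mips",
+                working_directory="data/processing",
+                processing_steps=[CORRECT_NAMES, SUM_LEVELS, SUM_SECTORS],
+            )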
+ """ + # init abstract class with checks + super().__init__( + input_directory=input_directory, + working_directory=working_directory, + checker=checker, + processing_steps=processing_steps, + ) + + self.available_steps = { + CORRECT_NAMES: {"order": 1, "function": self.correct_names_subdir}, + FIX_CO2: {"order": 2, "function": self.fix_co2_subdir}, + CREATE_FIRE_FILES: {"order": 3, "function": self.create_anthro_fire_subdir}, + CORRECT_UNITS: {"order": 4, "function": self.correct_units_subdir}, + CORRECT_CALENDAR: {"order": 5, "function": self._correct_calendar}, + CORRECT_TIME_AXIS: {"order": 6, "function": self._correct_time_axis}, + SUM_LEVELS: {"order": 7, "function": self._sum_levels}, + _CDO_PROCESSING: {"order": 8, "function": self.cdo_preprocess_subdir}, + SUM_SECTORS: {"order": 9, "function": self.sum_sectors_subdir}, + CREATE_TOTALS: {"order": 10, "function": self.create_totals_subdir}, + MERGE_GHG: {"order": 11, "function": self.merge_ghg_subdir}, + } + + self.cdo_operators = [] + self.spat_load = spat_load + self.temp_load = temp_load + + # ghg list that should be merged + self.ghg_list = ghg_list + if not self.ghg_list: + self.ghg_list = ["BC_sum", "CH4_sum", "SO2_sum"] + self.meta_data = meta_data + self.fire_cases = fire_cases + self.force_sum_sectors = force_sum_sectors + if not self.force_sum_sectors: + self.force_sum_sectors = ["CO2"] + self.input_freq = input_freq + self.input_res = input_res + + # TODO CO2 baseline - where?? + # CO2 baseline: do this outside, because it means that the data looks + # different for each climate model. best case you just store that somewhere + # in a config, store that well and apply it in the dataloader for each model separatetly? + # self.substract_co2_baseline = substract_co2_baseline + + def add_correct_names_step(self): + self.processing_steps.append(CORRECT_NAMES) + + def add_fix_co2_step(self): + self.processing_steps.append(FIX_CO2) + + def add_create_fire_files(self): + self.processing_steps.append(CREATE_FIRE_FILES) + + def add_correct_units_step(self): + self.processing_steps.append(CORRECT_UNITS) + + def add_correct_calendar_step(self): + self.processing_steps.append(CORRECT_CALENDAR) + + def add_correct_time_axis_step(self): + self.processing_steps.append(CORRECT_TIME_AXIS) + + def add_sum_levels_step(self): + self.processing_steps.append(SUM_LEVELS) + + def add_sum_sectors_step(self): + self.processing_steps.append(SUM_SECTORS) + + def add_create_totals_step(self): + self.processing_steps.append(CREATE_TOTALS) + + def add_merge_ghg_step(self): + self.processing_steps.append(MERGE_GHG) + + def set_load_types(self, spat_load: str = "map", temp_load: str = "mon"): + """ + Setting types for the loading processer. Can be set to update the types, so the same processer instance can be + applied afterwards. + + Args: + spat_load (str): Can be "map" or "tot". Map is a long / lat map. + Total is an array with the totals summed over the spatial axis. + temp_load (str): Can be "mon" or "year". Decides if the loaded data + is monthly or yearly averaged. Can be extended later on. + """ + self.spat_load = spat_load + self.temp_load = temp_load + + def type_class_meta(self) -> str: + """Returns the name tag of the subclass.""" + return "input4mips" + + # TODO share this in process class with input4mips / cmip6 + def file_belongs_to_type(self, input_file: Path) -> bool: + """ + Check if the name input4mip(s) appears in the path name. 
+ + Args: + input_file (Path): the file that should be checked for input4mips + Returns: + True if name indicates it's a cmip6 file, False if not. + """ + return bool(re.search("input4mip", str(input_file), re.IGNORECASE)) + + def preprocess_subdir( + self, + input_dir: Path, + cleaned_dir: Path, + processed_dir: Path, + load_dir: Path, + overwrite: bool = False, + silent: bool = False, + sum_sec_input_res: str = "50_km", + sum_sec_input_freq: str = "mon", + ): + """ + Applying all the stuff as decided in the init function. Most of the functions build upon each other, i.e. you + cannot apply store_totals if you haven't used merge_ghg before. To keep things modular the user can still decide + which ones to apply (e.g. because some things may have been applied earlier / are not necessary). Just be aware + that things break if you are not making sure that the order is followed. + + Args: + input_dir (Path): To which directory the processing should be applied. + cleaned_dir (Path): Where cleaned data should be stored. This is used + for the "preprocess" steps. + processed_dir (Path): Where processed data is stored. This is used + for sum_sectors. Simple preprocessing is directly applied on + raw data here. + load_dir (Path): Where data is stored that is strongly modified and + ready to be loaded. This is used for merge_ghg and store_totals. + overwrite (bool): If the data should be overwritten if it already + exists. Default False. + silent (bool): If this should be processed silently. + sum_sec_input_res: + sum_sec_input_freq: + """ + if any([process in self.processing_steps for process in [CORRECT_CALENDAR, CORRECT_TIME_AXIS, SUM_LEVELS]]): + self.processing_steps.append(_CDO_PROCESSING) + + process_list = [self.available_steps[step] for step in self.available_steps] + ordered_processes = sorted(process_list, key=lambda step: step["order"]) + + for _, process in ordered_processes: + # process["function"]() + print(process) + + def fix_co2_subdir(self, input_dir: Path): + """ + Fix CO2 files. + + Args: + input_dir (Path): All ssp CO2 files in that dir will be adapted + """ + print(f"Starting to fix co2 files in {input_dir}.") + total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): + if len(files) > 0: + for file in files: + if ("ssp" in file) and ("CO2" in file): + self._fix_co2(Path(path) / file) + + print("...Finished fixing CO2 files.") + + def _fix_co2(self, input_file: Path): + """ + Fix single co2 file. + + CDO can be applied again afterwards. + Memory issues might arise (core dumped). Not sure why. Runs on compute canada though. + Args: + input_file (Path): The co2 file that should be fixed. + """ + # ncpdq --ovr -a time,level,lon,lat input.nc output.nc + commands = ["ncpdq", "--ovr", "-a", "time,level,lon,lat", input_file, input_file] + subprocess.call(commands) + + def correct_names_subdir(self, input_dir: Path): + """ + Renames files (biomassburning) if needed. + + Args: + input_dir (Path): All files in that dir will be adapted + """ + print(f"Starting to rename biomassburning files in {input_dir}.") + total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): + if len(files) > 0: + for file in files: + self._rename_biomassburning_files(Path(path) / file) + + # remove empty subdirs + for p in input_dir.glob("**/*"): + if p.is_dir() and len(list(p.iterdir())) == 0: + os.removedirs(p) + + print(f"... 
Finished renaming biomassburning files in {input_dir}") + + # TODO test + def correct_units_subdir(self, input_dir: Path): + """ + Correcting the ghg units (if possible). + + Attention, this module + is slow since it is not handled with cdo. Do only if necessary. + Args: + input_dir (Path): All files in that dir will be adapted + """ + print(f"Starting to correct all units (if needed) in {input_dir}.") + total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): + if len(files) > 0: + for file in files: + self._correct_units(Path(path) / file) + + print(f"... Finished correcting the units (if needed) in {input_dir}") + + def _rename_biomassburning_files(self, input_file: Path): + """ + Rename file for biomassburning in case it is in the naked version (from the downloader). Originally, the files + are just called 'GHG' instead of GHG_em_biomassburning. Overwrites the files directly. Attention, + + apply this only directly to the raw data!! The Input4mips preprocesser + renames variables later to the pure / naked GHG version - do not rename + them! + Args: + input_file (Path): File that should be renamed + """ + # fetch the ghg + ghg = str(input_file.parents[0]).split("/")[-4] + # check if this is the naked case + if "_" not in ghg: + new_ghg = f"{ghg}_em_biomassburning" + new_path_name = Path(str(input_file).replace(ghg, new_ghg)) + new_path_name.parent.mkdir(parents=True, exist_ok=True) + # use cdo to rename the var internally + commands = [ + "cdo", + "-s", + "-L", + f"-chname,{ghg},{ghg}_em_biomassburning", + f"-setattribute,variable_id={ghg}_em_biomassburning", + input_file, + new_path_name, + ] + subprocess.call(commands) + + # remove the old file + input_file.unlink() + + # TODO test + def _correct_units(self, input_file: Path): + """ + Corrects units of a single file. + + Be aware this is slow if applied + to a large directory. Overwrites the old file + Args: + input_file (Path): The file whose units should be adapted. + """ + ureg = pint.UnitRegistry() + ds = xr.load_dataset(input_file) + + # only use the data variables (ignore bnds vars) + for var in ds.data_vars: + if "bnds" not in str(var): + # check the unit + found_unit = ds[var].units.replace("-", "^-") + desired_unit = self.desired_units[var].replace("-", "^-") + + # exit if it's the right one + if found_unit == desired_unit: + return + + else: + try: + ureg(found_unit) + ureg(desired_unit) + except pint.errors.UndefinedUnitError: + print( + "Unit is not defined for pint. Check here: https://github.com/hgrecco/pint/blob/master/pint/default_en.txt" + ) + return + # update ds + # this function works for any kind of unit transformations (if they can be transformed) + ds.update( + { + var: xr.apply_ufunc( + lambda x: ureg.Quantity(x, found_unit).to(desired_unit).magnitude, ds[var] + ) + } + ) + # update attrs + ds[var].attrs["units"] = desired_unit + # overwrite old file + ds.to_netcdf(input_file, mode="w", format="NETCDF4", engine="netcdf4") + + def merge_ghg_subdir( + self, input_dir: Path, output_dir: Path, ghg_list: list, ghg_sum_name: str = "ALL", overwrite: bool = False + ): + """ + Merging GHG emission files together. + + Can only be applied after + 'create_totals_subdir'! + Args: + input_dir (Path): Data dir that should be processed + output_dir (Path): Where the merged data should be stored + ghg_sum_name (str): How the dir and files where the merged data is + stored should be tagged. 
+ overwrite (bool): If files should be overwritten when they already exist + """ + print(f"Start creating merged GHG emission files for {ghg_list} in {input_dir}...") + # create a dict with the ghg files that should be merged + file_dict = {} + # total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir)): + if len(files) > 0: + for file in files: + full_path = str(Path(path) / file) + # do this only for the ghg listed above: + for ghg in ghg_list: + if ghg in full_path: + ghg_file_str = full_path.replace(ghg, ghg_sum_name) + # add ghg group as key if they don't exist yet + if not (ghg_file_str in file_dict): + file_dict[ghg_file_str] = [ghg] + else: + file_dict[ghg_file_str].append(ghg) + + # create the needed dirs + for file, ghgs in file_dict.items(): + # create output path + second_path_part = file.split("input4mips/")[-1] + out_path = Path(output_dir / "input4mips" / second_path_part) + out_path.parent.mkdir(parents=True, exist_ok=True) + + # create input files + input_files = [file.replace(ghg_sum_name, ghg) for ghg in ghgs] + + if (not out_path.is_file()) or overwrite: + self._merge_ghg_files( + input_files, + out_path, + ) + + print(f"... Finished merging GHG {ghg_list} in {output_dir}.") + + def _merge_ghg_files(self, input_files: list, output_file: Path, threads: int = 1): + """ + Merging GHG emission files together. + + Can only be applied after + 'create_totals_subdir'! + Args: + input_files (list): List of files (Path) that should be merged. + output_file (Path): Where the merged data should be stored + threads (int): How many threads should be used. + """ + if output_file.is_file(): + output_file.unlink() + + commands = ["cdo", "-w", "-s", "-L", "-P", str(threads), "-merge", *input_files, output_file] + + subprocess.call(commands) + + def create_totals_subdir( + self, + input_dir: Path, + output_dir: Path, + spat: str = "map", + temp: str = "mon", + input_freq: str = "mon", + input_res: str = "50_km", + overwrite: bool = True, + ): + """ + Creating the data that can actually be loaded by pytorch. + + Args: + input_dir (Path): Data dir that should be processed + output_dir (Path): Where the merged data should be stored + spat (str): Can be "map" or "total". Map stores ghg emission maps + (longitude / latitude). Total sums the values over the spatial axes. + temp (str): Can be "mon" or "year". In case of year the data is summed + over years. Assumes that monthly data is used! + input_freq (str): The frequency of the input data (can only process) + one type after another, so this must be fixed. Default "mon". + input_res (str): The nominal resolution of the input data (can only process) + one type after another, so this must be fixed. Default "50_km". + overwrite (bool): If files should be overwritten when they already exist + """ + print(f"Start creating loadable emission data {input_dir}....") + if spat not in ["map", "total"]: + raise ValueError("Parameter spat must be either 'map' or 'total'.") + if temp not in ["mon", "year"]: + raise ValueError("Parameter temp must be either 'mon' or 'year'.") + + # Match all the files that belong together and contain mon (only difference is the year ending!) 
+ # TODO move this function into an external one + file_dict = {} + # total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir)): + if len(files) > 0: + for file in files: + full_path = str(Path(path) / file) + if (input_res in full_path) and (input_freq in full_path): + year = int(file.split(".")[0].split("_")[-1]) + year_file_str = full_path.replace(str(year), "YEAR") + # add ghg group as key if they don't exist yet + if year_file_str not in file_dict: + file_dict[year_file_str] = [year] + else: + file_dict[year_file_str].append(year) + + # create the needed dirs + for file, years in file_dict.items(): + # create output path + second_path_part = file.split("input4mips/")[-1] + second_path_part = second_path_part.replace(input_res, f"{spat}_{input_res}") + second_path_part = second_path_part.replace(input_freq, f"{temp}") + second_path_part = second_path_part.replace("YEAR", f"{min(years)}-{max(years)}") + out_path = Path(output_dir / "input4mips" / second_path_part) + out_path.parent.mkdir(parents=True, exist_ok=True) + + # create input files + input_files = [file.replace("YEAR", str(y)) for y in years] + + if (not out_path.is_file()) or overwrite: + self._create_totals( + input_files, + out_path, + spat, + temp, + ) + + print(f"... Finished creating loadable emission data {output_dir}....") + + def _create_totals(self, input_files: list, output_path: Path, spat: str, temp: str, threads: int = 1): + """ + Creating totals for a given list of files for different spatial and temporal settings. + + Args: + input_files (list): List of paths that are used to create totals + output_path (Path): Where the merged totals should be stored. + spat (str): Can be "map" or "total". Map stores ghg emission maps + (longitude / latitude). Total sums the values over the spatial axes. + temp (str): Can be "mon" or "year". In case of year the data is summed + over years. Assumes that monthly data is used! + threads (int): number of threads + """ + # remove the file if it already exists + if output_path.is_file(): + output_path.unlink() + + # first basic command part + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + ] + + # add depending on cases + if spat == "total": + # commands.append("-fldsum") + # ATTTENTION this assumes that sum sectors has been applied before!! + ghg = str(input_files[0]).split("/")[-1].split("_")[2] + commands.append("expr,sum=fldsum({0});min=fldmin({0});max=fldmax({0});mean=fldmean({0});".format(ghg)) + + if temp == "year": + commands.append("-yearsum") + + # add relevant commands for all cases + commands.extend(["-mergetime", *input_files, output_path]) + + subprocess.call(commands) + + def _correct_calendar(self): + self.cdo_operators.append(f"-setcalendar,{self.calendar}") + + def _correct_time_axis(self): + # TODO add check for mon data only + self.cdo_operators.append(SET_DAY) + self.cdo_operators.append(SET_TIME) + + def _sum_levels(self): + self.cdo_operators.append("-vertsum") + + # TODO if a year / file is missing -> write a warning! + # TODO LATER add feature to check lon / lat consistency for input4mips + def cdo_preprocess_subdir(self, sub_dir: Path, output_dir: Path, overwrite: bool = False): + """ + Apply desired emission processing steps to a subdir and store it in output_dir. + + Can only be applied on raw input4mpis data, not on those + that have already been processed (i.e. merged / summed up). 
+ Args: + sub_dir (Path): Where the preprocessing should be applied + output_dir (Path): Where the results should be stored + overwrite (bool): If the data should be overwritten if it already exists + """ + # 1. apply the "preprocessing" - all the functions that can + # be applied to a single file + print(f"Start preprocessing of emission files {sub_dir}....") + total_files = len(list(sub_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(sub_dir), total=total_files): + if len(files) > 0: + for file in files: + # create output dir + input_file = Path(path) / file + if self.file_belongs_to_type(input_file): + output_file = create_generic_output_path(output_dir, path, file) + # skip if file already exists and we dont wanna overwrite it + if (not output_file.is_file()) or overwrite: + print(f"\nProcessing the following file: {input_file}") + self._cdo_preprocess(input_file, output_file) + + print(f"...Finished preprocessing \nof {sub_dir} and saved it at {output_dir}.") + + def _cdo_preprocess(self, input_file: Path, output_file: Path, threads: int = 1, cdo_operators=None): + """Applies all the emission processing tasks defined during init on + a given netcdf file - but only those ones that can operate on a single + file. + Args: + input_file (Path): The file that should be processed. + output_file (Path): Where the resulting file is stored. + threads (int): threads for cdo + """ + # RENAME variables in case of biomassburning + # needs to be done beforehand becauses messes too much stuff up + # is done immediately in place + if "em_biomassburning" in str(input_file): + ds = xr.load_dataset(input_file) + var = ds.variable_id + old_name = str(input_file) + renamed_file = Path(old_name.replace(".nc", "_renamed.nc")) + if "em_biomassburning" not in var: + commands = [ + "cdo", + "-s", + "-L", + f"-chname,{var},{var}_em_biomassburning", + f"-setattribute,variable_id={var}_em_biomassburning", + input_file, + renamed_file, + ] + subprocess.call(commands) + # rename files & reassign the input_file + input_file.unlink() + input_file = renamed_file.rename(old_name) + + chained_operators = copy(self.cdo_operators) + # cdo vertsum infile outfile [CHECKED] + # AIR: sum over height levels + # anthro: sum over different sectors + + # ATTENTION only relevant for "monthly" data! 
+ # cdo -setday,1 -settime,00:00:00 + # TODO add check for mon data only + if "mon" not in "": + chained_operators.remove(SET_DAY) + chained_operators.remove(SET_TIME) + LOGGER.warning("Tried to set the time axis on non-monthly file; removing related cdo operators for file.") + + # run commands + commands = ["cdo", "-s", "-w", "-L", "-P", str(threads), *chained_operators, input_file, output_file] + subprocess.call(commands) + + # unit_key_list = [ + # "units", "unit", + # "variable_units", "variable_unit", + # "var_units", "var_unit" + # ] + + # for k in unit_key_list: + # try: + # # TODO try to get the units + # pass + # # if successful: break loop + # except AttributeError: + # # TODO continue + # continue + # if unit is None: + # raise KeyError("Unit could not be found in file {}".format(input_file)) + # else: + # # TODO check if the unit is the one saved in the jason file + # calendar = self.meta_raw_dict["calendar"][var] + # pass + # simply raises an error in case we encounter units that we dont expect + + # TODO break this into two functions + def sum_sectors_subdir( + self, + sub_dir: Path, + output_dir: Path, + force_sum_sectors: list = None, + fire_cases: bool = False, + overwrite: bool = True, + silent: bool = False, + input_res: str = "50_km", + input_freq: str = "mon", + ): + """ + Args: + force_sum_sectors (str): Default is empty, i.e. no merge is forced. + Put the name of GHG here that should be forced to merge. + fire_cases (bool): Default False means that simply all the data is + summed up and that's it. In case this is set to true, three different + things are calculated: anthro-fires, no-fires, all-fires. Can + only be done if "em_openburning" or "em_biomassburning" files + are included. + overwrite (bool): if the files should be overwritten if they already exist + silent (bool): If this should be processed silently. 
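+
+        Example (illustrative; the directories are placeholders):
+            processer.sum_sectors_subdir(
+                sub_dir=Path("data/cleaned/input4mips"),
+                output_dir=Path("data/processed/input4mips"),
+                fire_cases=True,
+            )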
+ """ + print(f"Start summing up different types of emission files in {sub_dir}...") + if not force_sum_sectors: + force_sum_sectors = [] + # params of the files to sum over + # TODO make these params in a config + # TODO find out which files are skipped + # res = "50_km" + # freq = "mon" + # years = [] + # dicts needed to match files together + # "example_path_SECTOR_sth": [None, None, None] + ssp_file_dict = {} + historical_file_dict = {} + sector_idx = {"em_AIR_anthro": 0, "em_anthro": 1, "em_openburning": 2, "em_biomassburning": 2} + + # match all files that have the exact same name and end on + total_files = len(list(sub_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(sub_dir), total=total_files): + if len(files) > 0: + for file in files: + full_path = str(Path(path) / file) + # only process the files with the right res and frequency + if (input_res in full_path) and (input_freq in full_path): + # sector descriptions + # check if this is historical or ssp and match + if "ssp" in file: + sector = re.search(r"(em_AIR_anthro|em_anthro|em_openburning)", full_path).group(0) + sector_file_str = full_path.replace(sector, "SECTOR") + # add ghg group as key if they don't exist yet + if sector_file_str not in ssp_file_dict: + ssp_file_dict[sector_file_str] = [None, None, None] + # add the found sector at the right sector index + # ssp_file_dict[sector_file_str][sector_idx[sector]] = full_path + ssp_file_dict[sector_file_str][sector_idx[sector]] = sector + + elif "historical" in file: + sector = re.search(r"(em_AIR_anthro|em_anthro|em_biomassburning)", full_path).group(0) + sector_file_str = full_path.replace(sector, "SECTOR") + # add ghg group as key if they don't exist yet + if sector_file_str not in historical_file_dict: + historical_file_dict[sector_file_str] = [None, None, None] + # add the sector at the right sector index + # history_file_dict[sector_file_str][sector_idx[sector]] = full_path + historical_file_dict[sector_file_str][sector_idx[sector]] = sector + + else: + raise ValueError( + "Input4mips files must include the scenario ('ssp' or 'historical') in their name." + ) + + self._sum_sector_cases( + file_dict=ssp_file_dict, + output_dir=output_dir, + fire_cases=fire_cases, + force_sum_sectors=force_sum_sectors, + overwrite=False, + silent=silent, + ) + + self._sum_sector_cases( + file_dict=historical_file_dict, + output_dir=output_dir, + fire_cases=fire_cases, + force_sum_sectors=force_sum_sectors, + overwrite=False, + silent=silent, + ) + + print(f"...Finished summing up ghg emissions and stored results in {output_dir}") + + def _sum_sector_cases( + self, + file_dict: dict, + output_dir: Path, + fire_cases: bool, + force_sum_sectors: list = None, + overwrite: bool = False, + silent: bool = False, + ): + """ + Calls sum sector for different cases. + + Args: + file_dict (dict): "example_path_SECTOR_sth": [file_path, file_path, file_path] + output_dir (Path): output directory, where to save the diff cases. + fire_cases (bool): if the diff fire cases should be considered + force_sum_sectors (str): Default is empty, i.e. no merge is forced. + Put the name of GHG here that should be forced to merge. + overwrite (bool): if the files should be overwritten if they already exist + silent (bool): If this should be processed silently. 
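+
+        Example file_dict entry (illustrative; the key is a placeholder path with the
+        sector replaced by "SECTOR", the value lists the sectors found for that file,
+        ordered [AIR, anthro, open-/biomassburning], with None for missing ones):
+            {".../input4mips/.../SECTOR/.../file.nc": ["em_AIR_anthro", "em_anthro", None]}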
+ """ + if not force_sum_sectors: + force_sum_sectors = [] + + for file, sectors in tqdm(file_dict.items()): + num_nones = sum([sec is None for sec in sectors]) + + # create file lists and names for the case where everything is added + ghg = file.split("input4mips/")[-1].split("/")[1].split("_")[0] + input_files = [Path(file.replace("SECTOR", sec)) for sec in sectors if sec is not None] + sector_list = [f"{ghg}_{sec}" for sec in sectors if sec is not None] + out_path = Path(output_dir / "input4mips" / file.split("input4mips/")[-1].replace("SECTOR", "sum")) + out_path.parent.mkdir(parents=True, exist_ok=True) + + if num_nones == 0: + # normal case + if fire_cases: + self.sum_sectors_fires(input_files, sector_list, out_path, overwrite=overwrite) + else: + self._sum_sectors(input_files, sector_list, out_path, overwrite=overwrite) + + elif (num_nones == 1) and (ghg in force_sum_sectors): + # just the simple summing if desired + warning + if not silent: + warnings.warn(f"Merged files for {file}, however some sectors are missing", stacklevel=2) + self._sum_sectors(input_files, sector_list, out_path, overwrite=overwrite) + + elif (num_nones == 1) and ghg not in force_sum_sectors: + if not silent: + warnings.warn( + "Skipping case {}. Not all three sectors are available. You can force merging with 'force_sum_sectors=[GHG1, GHG2]'.".format( + file + ), + stacklevel=2, + ) + + elif num_nones == 2: + # warning, skipping + if not silent: + warnings.warn( + f"Skipping case {file}. At least two files must exist to be summed together.", + stacklevel=2, + ) + else: + raise RuntimeError("'num_nones' must be between 0 and 3.") + + def set_meta_data(self, meta_data: Path): + """Set meta data path if not done during initialization.""" + self.meta_data = meta_data + + def create_anthro_fire_subdir(self, input_dir: Path, overwrite: bool = False): + """ + Creating anthropogenic fire files for a subdir. + + Args: + input_dir (Path): Input dir + overwrite (bool): If data should be overwritten. Default False. + """ + # loop through input dir, get biomassburning and openburning files + print(f"Starting to create anthropogenic fire files for files in {input_dir}.") + fire_output_dir = self.meta_data / "anthro-fire-data" + total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): + if len(files) > 0: + for file in files: + input_file = Path(path) / file + # must match the right nominal and temporal resolution + if ("biomassburning" in file) and ("25_km" in file) and ("mon" in file): + type = "biomassburning" + elif ("openburning" in file) and ("50_km" in file) and ("mon" in file): + type = "openburning" + else: + continue + scenario = file.split("_")[1] + ghg = file.split("_")[2] + fire_name = file.replace("_em_", "_anthro_") + output_file = Path(fire_output_dir / type / scenario / ghg / fire_name) + output_file.parent.mkdir(parents=True, exist_ok=True) + if (not output_file.is_file()) or overwrite: + self._create_anthro_fire_file(input_file, type, output_file) + + print(f"... Finished creating anthro fire files and stored them in {fire_output_dir}") + + def _create_anthro_fire_file(self, input_file: Path, type: str, output_file: Path, threads: int = 1): + """ + Create a new openburning or biomassburning file from a given file. + + Returns the path where the new file lifes. 
+ Args: + input_file (Path): + type (str): either 'biomassburning' or 'openburning' + output_file (Path): Where to store the anthro fire file + threads (int): threads for cdo + """ + if self.meta_data is None: + print("You need to provide meta data for this function. Use set_meta_data func for that.") + file_parts = str(input_file.stem).split("_") + scenario, ghg, year = file_parts[1], file_parts[2], file_parts[-1] + meta_dir = self.meta_data / type + + # prepare additional operators (calendar and time-axis) for later compatability + chained_operators = [] + # correct calendar and time axis right away if necessary + if self.correct_time_axis: + chained_operators.append("-setday,1") + chained_operators.append("-settime,00:00:00") + if self.correct_calendar: + # default calendar: 365_day + chained_operators.append(f"-setcalendar,{self.calendar}") + + if type == "openburning": + # find a file that contains openburning, scenario and ghg in the meta_data / "future-openburning" + meta_dir = self.meta_data / "future-openburning" + matched_files = [ + f for f in meta_dir.rglob("*.nc") if (scenario in str(f)) and (type in str(f)) and (ghg in str(f)) + ] + # take first file that matches that + raw_percentage_file = matched_files[0] + tmp_perc_file = input_file.parents[0] / f"tmp_percentage_{year}.nc" + + # 1. drop the years you don't need from the meta file. -selyear,2015 + # 2. drop the levels you dont need - sellevel,0,1 + # 3. sum the remaining levels / sectors -vertsum + # -> cdo -vertsum -sellevel,0,1 -selyear,year meta_input.nc 2015_share_output.nc + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + "-vertsum", + "-sellevel,0,1", # 0 is agriculture, 1 is deforestation + f"-selyear,{year}", + raw_percentage_file, + tmp_perc_file, + ] + subprocess.call(commands) + + # multiply -> cdo mul fire_file.nc percentage_file new_anthro_fire_file.nc + # [IMPORTANT first file must be input-file -> gets meta data from first input file!] + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + *chained_operators, + "-mul", + input_file, + tmp_perc_file, + output_file, + ] + subprocess.call(commands) + + # remove 2015_share_output file to save space (unlink) + tmp_perc_file.unlink() + + elif type == "biomassburning": + meta_dir = self.meta_data / "historic-biomassburning" + matched_files = [ + f for f in meta_dir.rglob("*.nc") if (type in str(f)) and (ghg in str(f)) and (year in str(f)) + ] + + # find the agri / defo files (written this way for later extension) + perc_dict = { + "AGRI": None, + "DEFO": None, + } + for f in matched_files: + for key in perc_dict.keys(): + if key in str(f): + perc_dict[key] = f + if (perc_dict["AGRI"] is None) or (perc_dict["DEFO"] is None): + warnings.warn("The AGRI or DEFO file does not exist, see dictionary: \n", stacklevel=2) + print(perc_dict) + print("... 
skipping this file.") + return + + # cdo 'expr,percentage_AGRI=percentage_AGRI/100' add percentage_AGRI_year.nc percentage_DEFO_year.nc percentage_ANTHRO_year.nc + tmp_anthro_perc_file = input_file.parents[0] / f"tmp_percentage_anthro_{year}.nc" + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + "expr,percentage_AGRI=percentage_AGRI/100;", + "-add", + perc_dict["AGRI"], + perc_dict["DEFO"], + tmp_anthro_perc_file, + ] + subprocess.call(commands) + + # cdo mul input_file.nc percentage_ANTHRO_year.nc output.nc + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + *chained_operators, + "-mul", + input_file, + tmp_anthro_perc_file, + output_file, + ] + subprocess.call(commands) + + # delete temp files + tmp_anthro_perc_file.unlink() + + # BIOMASSBURNING + # TODO get the right biomassburning files: year, GHG, biomassburning and AGRI + DEFO + # 1. cdo 'expr,percentage_AGRI=percentage_AGRI/100' -selyear,year meta_input.nc percentage_AGRI_year.nc + # 2. cdo 'expr,percentage_DEFO=percentage_DEFO/100' -selyear,year meta_input.nc percentage_DEFO_year.nc + # 3. cdo add percentage_AGRI_year.nc percentage_DEFO_year.nc percentage_ANTHRO_year.nc + # 4. cdo mul input_file.nc percentage_ANTHRO_year.nc output.nc + # 5 remove old files: all the percentage files + else: + raise ValueError("Type must be either 'openburing' or 'biomassburning'.") + + def sum_sectors_fires(self, input_files: list, sectors: list, output_path: Path, overwrite: bool = True): + """ + Args: + input_files (list): List of paths. The paths contain different + sectors and the information is summed up and stored in the + desired output_file. + sectors (list): List of strings. The different files that are stored + in input_files. The variable names of input_files. + output_path (Path): File where the resulting emissions are stored. + The output_path is adapted in case different fire cases are created. + overwrite (bool): Default is True, i.e. data will be overwritten if it already exists + + """ + # check where (and if) biomassburning or openburning are contained + fire_idx = None + for i, var in enumerate(sectors): + if ("openburning" in var) or ("biomassburning" in var): + fire_idx = i + if fire_idx is None: + raise ValueError( + "Fire cases can only be created if openburning or biomassburning files are given. (Must be contained in the file name as well)." + ) + + # Get ghg and fire type + ghg = sectors[0].split("_")[0] + # fire_type = sectors[fire_idx].split("_")[-1] + + # CASE 1: ALL FIRES + all_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_all-fires_")) + all_fire_files = input_files + all_fire_sectors = sectors + self._sum_sectors(all_fire_files, all_fire_sectors, all_fire_output_path, overwrite=overwrite) + + # CASE 2: ANTHRO FIRES + anthro_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_anthro-fires_")) + anthro_fire_sectors = sectors + # get the right anthro fire file from meta + # 1. get the current fire file + all_fire_file = input_files[fire_idx] + anthro_file_name = str(all_fire_file.name).replace("_em_", "_anthro_") + potential_anthro_files = [f for f in self.meta_data.rglob(anthro_file_name)] + if len(potential_anthro_files) < 1: + print(f"No anthro fire file found for {all_fire_file.name}.") + pass + else: + if len(potential_anthro_files) > 1: + print(f"Several fire files found for {all_fire_file.name}. 
Choosing only the first one.") + anthro_fire_files = input_files.copy() + anthro_fire_files[fire_idx] = potential_anthro_files[0] + self._sum_sectors(anthro_fire_files, anthro_fire_sectors, anthro_fire_output_path, overwrite=overwrite) + + # CASE 3: NO FIRES + # drop the openburning or biomassburning sectors and files + no_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_no-fires_")) + no_fire_files = input_files.copy() + no_fire_sectors = sectors.copy() + no_fire_files.pop(fire_idx) + no_fire_sectors.pop(fire_idx) + self._sum_sectors(no_fire_files, no_fire_sectors, no_fire_output_path, overwrite=overwrite) + + def _sum_sectors( + self, input_files: list, sectors: list, output_path: Path, overwrite: bool = True, threads: str = 1 + ): + """ + Args: + input_files (list): List of paths. The paths contain different + sectors and the information is summed up and stored in the + desired output_file. + sectors (list): List of strings. The different files that are stored + in input_files. The variable names of input_files. + output_path (Path): File where the resulting emissions are stored. + The output_path is adapted in case different fire cases are created. + overwrite (bool): True, i.e. if the file in output_path already + exists, it is overwritten. Set to False if you don't wanna overwrite + these files. + threads (str): Default 1. + + """ + # exit this if the file already exists and we dont wanna overwrite them + if output_path.is_file() and (not overwrite): + return + + ghg = sectors[0].split("_")[0] + # math sum expression + sum_expr = f"{ghg}={sectors[0]}" + for sec in sectors[1:]: + sum_expr = f"{sum_expr}+{sec}" + + # cdo expr,'var_new=var1+var2;' -merge infile1 infile2 outfile # works only on cmd + # cdo 'expr,var_new=var1+var2;' -merge infile1 infile2 outfile # works for both + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + f"expr,{sum_expr};", + "-merge", + *input_files, + output_path, + ] + subprocess.call(commands) diff --git a/climateset/processing/raw/utils.py b/climateset/processing/raw/utils.py new file mode 100644 index 0000000..cfaeadd --- /dev/null +++ b/climateset/processing/raw/utils.py @@ -0,0 +1,18 @@ +from pathlib import Path + + +def create_generic_output_path(output_dir: Path, path: str, file: str) -> Path: + """Creates an output path and the necessary parent directories. + 'Generic' in the sense that file names are not adapted as in other + create_output_path functions used in the resolution processers. + Args: + output_dir (Path): First part of the path for the output dir + path (str): Can e.g. stem from os.walk - path of the current file. + file (str): Can e.g. stem from os.walk - current file. + Returns: + Path: New path pointing where the output file can be stored. 
+ """ + topic_dir = file.split("_")[0] + out_path = Path(output_dir / topic_dir / Path(path.split(f"{topic_dir}/")[-1]) / file) + out_path.parent.mkdir(parents=True, exist_ok=True) + return out_path diff --git a/climateset/processing/resolution/__init__.py b/climateset/processing/resolution/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/configs/processing/raw_processing_params.json b/configs/processing/raw_processing_params.json new file mode 100644 index 0000000..a119551 --- /dev/null +++ b/configs/processing/raw_processing_params.json @@ -0,0 +1,52 @@ +{ + "calendar": "365_day", + "units": { + "tas": "K", + "pr": "kg m-2 s-1", + "SO2_em_AIR_anthro": "kg m-2 s-1", + "SO2_em_anthro": "kg m-2 s-1", + "SO2_em_openburning": "kg m-2 s-1", + "SO2_em_biomassburning": "kg m-2 s-1", + "SO2_anthro_biomassburning": "kg m-2 s-1", + "CO2_em_AIR_anthro": "kg m-2 s-1", + "CO2_em_anthro": "kg m-2 s-1", + "CO2_em_openburning": "kg m-2 s-1", + "CO2_em_biomassburning": "kg m-2 s-1", + "CO2_anthro_biomassburning": "kg m-2 s-1", + "CH4_em_AIR_anthro": "kg m-2 s-1", + "CH4_em_anthro": "kg m-2 s-1", + "CH4_em_openburning": "kg m-2 s-1", + "CH4_em_biomassburning": "kg m-2 s-1", + "CH4_anthro_biomassburning": "kg m-2 s-1", + "BC_em_AIR_anthro": "kg m-2 s-1", + "BC_em_anthro": "kg m-2 s-1", + "BC_em_openburning": "kg m-2 s-1", + "BC_em_biomassburning": "kg m-2 s-1", + "BC_anthro_biomassburning": "kg m-2 s-1" + }, + "50_km": { + "input4mips_lon": { + "min": -179.75, + "max": 179.75, + "step": 0.5 + }, + "input4mips_lat": { + "min": -89.75, + "max": 89.75, + "step": 0.5 + } + }, + "250_km": { + "cmip6_lon": { + "min": 0, + "max": 357.5, + "step": 2.5 + }, + "cmip6_lat": { + "min": -90, + "max": 90, + "step": 1.8947368421052602 + } + } + +} diff --git a/tests/test_processing/__init__.py b/tests/test_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_processing/test_raw_processing.py b/tests/test_processing/test_raw_processing.py new file mode 100644 index 0000000..276eb57 --- /dev/null +++ b/tests/test_processing/test_raw_processing.py @@ -0,0 +1,61 @@ +from unittest.mock import patch + +import pytest +from xarray import open_dataset + +from climateset import DATA_DIR +from climateset.processing.raw.checker import BasicDirectoryChecker +from climateset.processing.raw.input4mips_processing import ( + AVAILABLE_INPUT4MIPS_PROCESSING_STEPS, + Input4MipsEmissionProcesser, +) + + +@pytest.fixture() +def mock_xarray_load_dataset(): + with patch("xarray.load_dataset", new=open_dataset) as mock_function: + yield mock_function + + +@pytest.fixture +def simple_emission_processing_object(): + data_dir = DATA_DIR / "raw/raw_input_vars/PNNL-JGCRI/CO2_em_anthro" + return Input4MipsEmissionProcesser(input_directory=data_dir) + + +def test_emission_processing_init(simple_emission_processing_object): + assert simple_emission_processing_object.input_directory == DATA_DIR / "raw/raw_input_vars/PNNL-JGCRI/CO2_em_anthro" + assert isinstance(simple_emission_processing_object.checker, BasicDirectoryChecker) + + +def test_emission_processing_basic_directory_check(simple_emission_processing_object, mock_xarray_load_dataset): + check_results = simple_emission_processing_object.checker.check_directory() + assert check_results + + +def test_emission_processing_check_available_steps(simple_emission_processing_object): + steps = simple_emission_processing_object.list_available_steps() + is_step_available = [step in AVAILABLE_INPUT4MIPS_PROCESSING_STEPS for step in steps] + assert 
all(is_step_available) + + +def test_emission_processing_add_processing_steps(simple_emission_processing_object): + step_list = list(AVAILABLE_INPUT4MIPS_PROCESSING_STEPS) + simple_emission_processing_object.add_processing_step(step_list) + is_step_added = [step in step_list for step in simple_emission_processing_object.processing_steps] + assert all(is_step_added) + + +def test_emission_processing_add_individual_steps(simple_emission_processing_object): + simple_emission_processing_object.add_correct_names_step() + simple_emission_processing_object.add_fix_co2_step() + simple_emission_processing_object.add_create_fire_files() + simple_emission_processing_object.add_correct_units_step() + simple_emission_processing_object.add_correct_calendar_step() + simple_emission_processing_object.add_correct_time_axis_step() + simple_emission_processing_object.add_sum_levels_step() + simple_emission_processing_object.add_sum_sectors_step() + simple_emission_processing_object.add_create_totals_step() + simple_emission_processing_object.add_merge_ghg_step() + for step in simple_emission_processing_object.processing_steps: + assert step in list(AVAILABLE_INPUT4MIPS_PROCESSING_STEPS) From 8097ac2cca9919a54521c267a1da8888ceecdf13 Mon Sep 17 00:00:00 2001 From: f-PLT Date: Fri, 1 Nov 2024 17:34:56 -0400 Subject: [PATCH 03/10] Fix some f-string and make flynt run before black --- .pre-commit-config.yaml | 12 ++++++------ climateset/processing/raw/checker.py | 19 +++++++------------ noxfile.py | 4 ++-- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5d4254c..8489f55 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,6 +17,12 @@ repos: - id: check-added-large-files args: ["--maxkb=5000"] + - repo: https://github.com/ikamensh/flynt + rev: 1.0.1 + hooks: + - id: flynt + args: ["--fail-on-change"] + - repo: https://github.com/psf/black rev: 23.12.1 hooks: @@ -27,12 +33,6 @@ repos: hooks: - id: isort - - repo: https://github.com/ikamensh/flynt - rev: 1.0.1 - hooks: - - id: flynt - args: ["--fail-on-change"] - - repo: https://github.com/PyCQA/docformatter rev: v1.7.5 hooks: diff --git a/climateset/processing/raw/checker.py b/climateset/processing/raw/checker.py index 037bd8c..01df48c 100644 --- a/climateset/processing/raw/checker.py +++ b/climateset/processing/raw/checker.py @@ -287,9 +287,8 @@ def check_variables(self): if actual_var != named_var: self.results["var_ok"] = False LOGGER.warning( - "The following file contains the variable {}, but the variable {} was expected: \n{}".format( - actual_var, named_var, self.input_file - ) + f"The following file contains the variable {actual_var}, but the variable {named_var} " + f"was expected: \n{self.input_file}" ) else: self.results["var_ok"] = True @@ -337,9 +336,8 @@ def check_resolution(self): if actual_spat_res != named_spat_res: self.results["res_ok"] = False LOGGER.warning( - "The following file has the nominal resolution {}, but the resolution {} was expected: \n{}".format( - actual_spat_res, named_spat_res, self.input_file - ) + f"The following file has the nominal resolution {actual_spat_res}, but the resolution " + f"{named_spat_res} was expected: \n{self.input_file}" ) else: self.results["res_ok"] = True @@ -349,9 +347,8 @@ def check_resolution(self): if actual_temp_res != named_temp_res: self.dataset["freq_ok"] = False LOGGER.warning( - "The following file has the temporal frequency {}, but the frequency {} was expected: \n{}".format( - actual_temp_res, 
named_temp_res, self.input_file - ) + f"The following file has the temporal frequency {actual_temp_res}, but the frequency " + f"{named_temp_res} was expected: \n{self.input_file}" ) else: self.dataset["freq_ok"] = True @@ -377,9 +374,7 @@ def check_units_ok(self): if found_unit != expected_unit: self.results["units_ok"] = False LOGGER.warning( - "The following file has the unit {}, but the unit {} was expected: \n{}".format( - found_unit, expected_unit, self.input_file - ) + f"The following file has the unit {found_unit}, but the unit {expected_unit} was expected: \n{self.input_file}" ) else: self.results["units_ok"] = True diff --git a/noxfile.py b/noxfile.py index 42460cb..888c791 100644 --- a/noxfile.py +++ b/noxfile.py @@ -55,9 +55,9 @@ def docformatter(session): @nox.session() def check(session): paths = get_paths(session) + session.run("poetry", "run", "flynt", *paths["all"], external=True) session.run("poetry", "run", "black", "--check", *paths["all"], external=True) session.run("poetry", "run", "isort", *paths["all"], "--check", external=True) - session.run("poetry", "run", "flynt", *paths["all"], external=True) session.run( "poetry", "run", @@ -74,9 +74,9 @@ def check(session): @nox.session() def fix(session): paths = get_paths(session) + session.run("poetry", "run", "flynt", *paths["all"], external=True) session.run("poetry", "run", "black", *paths["all"], external=True) session.run("poetry", "run", "isort", *paths["all"], external=True) - session.run("poetry", "run", "flynt", *paths["all"], external=True) session.run( "poetry", "run", From d362f8325fe5ed7c015e281138e8c80b67626d04 Mon Sep 17 00:00:00 2001 From: f-PLT Date: Tue, 19 Nov 2024 17:17:11 -0500 Subject: [PATCH 04/10] Add preliminary pipeline class structure and move functions to processing. 
Not everything works, this is mostly a progress save --- climateset/__init__.py | 1 + .../processing/abstract_processor_step.py | 13 + ...rocessing.py => abstract_raw_processor.py} | 6 +- climateset/processing/raw/checker.py | 12 +- ...cmip6_processing.py => cmip6_processor.py} | 7 +- .../processing/raw/input4mips/__init__.py | 0 .../raw/input4mips/input4mips_processor.py | 433 ++++++ .../processing/raw/input4mips/processing.py | 901 +++++++++++++ .../processing/raw/input4mips_processing.py | 1173 ----------------- climateset/processing/raw/utils.py | 4 + tests/test_processing/test_raw_processing.py | 4 +- tests/test_processing/test_utils.py | 11 + 12 files changed, 1374 insertions(+), 1191 deletions(-) create mode 100644 climateset/processing/abstract_processor_step.py rename climateset/processing/raw/{abstract_raw_processing.py => abstract_raw_processor.py} (95%) rename climateset/processing/raw/{cmip6_processing.py => cmip6_processor.py} (98%) create mode 100644 climateset/processing/raw/input4mips/__init__.py create mode 100644 climateset/processing/raw/input4mips/input4mips_processor.py create mode 100644 climateset/processing/raw/input4mips/processing.py delete mode 100644 climateset/processing/raw/input4mips_processing.py create mode 100644 tests/test_processing/test_utils.py diff --git a/climateset/__init__.py b/climateset/__init__.py index ab49175..067b165 100644 --- a/climateset/__init__.py +++ b/climateset/__init__.py @@ -6,6 +6,7 @@ DATA_DIR = PROJECT_ROOT / "data" RAW_DATA = DATA_DIR / "raw" PROCESSED_DATA = DATA_DIR / "processed" +FINAL_DATA = DATA_DIR / "final" LOAD_DATA = DATA_DIR / "load" META_DATA = DATA_DIR / "meta" SCRIPT_DIR = PROJECT_ROOT / "scripts" diff --git a/climateset/processing/abstract_processor_step.py b/climateset/processing/abstract_processor_step.py new file mode 100644 index 0000000..33c2d3c --- /dev/null +++ b/climateset/processing/abstract_processor_step.py @@ -0,0 +1,13 @@ +from abc import ABC, abstractmethod + + +class AbstractProcessorStep(ABC): + def __init__(self): + self.results_directory = None + + @abstractmethod + def execute(self, input_directory): + pass + + def get_results_directory(self): + return self.results_directory diff --git a/climateset/processing/raw/abstract_raw_processing.py b/climateset/processing/raw/abstract_raw_processor.py similarity index 95% rename from climateset/processing/raw/abstract_raw_processing.py rename to climateset/processing/raw/abstract_raw_processor.py index e94b535..3f57f6e 100644 --- a/climateset/processing/raw/abstract_raw_processing.py +++ b/climateset/processing/raw/abstract_raw_processor.py @@ -12,7 +12,7 @@ LOGGER = create_logger(__name__) -class AbstractRawProcesser(ABC): +class AbstractRawProcessor(ABC): """Abstract class for raw processing.""" def __init__( @@ -53,7 +53,7 @@ def type_class_meta(self) -> str: """Returns the name tag of the subclass.""" @abstractmethod - def preprocess_subdir( + def process_directory( self, input_dir: Path, cleaned_dir: Path, @@ -99,6 +99,6 @@ def add_processing_step(self, steps: Union[str, list]): self.processing_steps.append(step) def list_available_steps(self): - step_list = [step for step in self.available_steps.keys() if not step.startswith("_")] + step_list = [step for step in self.available_steps if not step.startswith("_")] LOGGER.info(f"Available steps: {step_list}") return step_list diff --git a/climateset/processing/raw/checker.py b/climateset/processing/raw/checker.py index 01df48c..61b7e17 100644 --- a/climateset/processing/raw/checker.py +++ 
b/climateset/processing/raw/checker.py @@ -178,18 +178,11 @@ def add_lonlat_check(self): def add_variables_check(self): self.checker_steps.append(VERIFY_VARIABLES) - def check_directory(self, log_file: Path = None, model: str = "") -> bool: + def check_directory(self) -> bool: """ Checking all files in a sub dir. TAKES TIME. - Args: - sub_dir (Path): The dir that should be checked - log_file (Path): Where problematic files should be logged - mode (str): 'w' writes the logs into the file, 'a' appends it to - an existing file. - model (str): Default is "". Set this, if you want that only - the data of this model is checked. Returns: bool: True if all checks were successful, False if not """ @@ -374,7 +367,8 @@ def check_units_ok(self): if found_unit != expected_unit: self.results["units_ok"] = False LOGGER.warning( - f"The following file has the unit {found_unit}, but the unit {expected_unit} was expected: \n{self.input_file}" + f"The following file has the unit {found_unit}, but the unit {expected_unit} " + f"was expected: \n{self.input_file}" ) else: self.results["units_ok"] = True diff --git a/climateset/processing/raw/cmip6_processing.py b/climateset/processing/raw/cmip6_processor.py similarity index 98% rename from climateset/processing/raw/cmip6_processing.py rename to climateset/processing/raw/cmip6_processor.py index 61a2711..0b43beb 100644 --- a/climateset/processing/raw/cmip6_processing.py +++ b/climateset/processing/raw/cmip6_processor.py @@ -9,13 +9,13 @@ import xmip.preprocessing as xmip_preprocessing from tqdm import tqdm -from climateset.processing.raw.abstract_raw_processing import AbstractRawProcesser +from climateset.processing.raw.abstract_raw_processor import AbstractRawProcessor from climateset.processing.raw.utils import create_generic_output_path # Attention: we need to write _desired_units for our all our data # Attention: Before you apply, make sure this is only applied to CMIP6 data! -class ClimateModelProcesser(AbstractRawProcesser): +class ClimateModelProcesser(AbstractRawProcessor): """ Can be called to apply xmip preprocessing. @@ -36,7 +36,6 @@ def __init__( correct_time_axis: bool = True, correct_calendar: bool = True, sum_levels: bool = True, - **kwargs, ): """Init function for xmip processer Args: @@ -111,7 +110,7 @@ def update_desired_units_dict(self, var: str, unit: str): xmip.preprocessing._desired_units[var] = unit # share this with input4mips? 
- def preprocess_subdir( + def process_directory( self, sub_dir: Path, output_dir: Path, diff --git a/climateset/processing/raw/input4mips/__init__.py b/climateset/processing/raw/input4mips/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/climateset/processing/raw/input4mips/input4mips_processor.py b/climateset/processing/raw/input4mips/input4mips_processor.py new file mode 100644 index 0000000..c15368d --- /dev/null +++ b/climateset/processing/raw/input4mips/input4mips_processor.py @@ -0,0 +1,433 @@ +import re +from pathlib import Path +from typing import Union + +from climateset import PROCESSED_DATA +from climateset.processing.abstract_processor_step import AbstractProcessorStep +from climateset.processing.raw.abstract_raw_processor import AbstractRawProcessor +from climateset.processing.raw.checker import AbstractDirectoryChecker +from climateset.processing.raw.input4mips.processing import ( + CDO_SET_CALENDAR, + CDO_SET_DAY, + CDO_SET_TIME, + CDO_VERTICAL_SUM, + cdo_process_directory, + correct_units_all_files_in_directory, + create_anthro_fire_directory, + create_emissions_totals, + merge_ghg_directory, + rename_biomassburning_files_in_directory, + sum_sectors_directory, +) +from climateset.utils import create_logger + +# +# Processing steps +# + +# private step +_CDO_PROCESSING = "_cdo_processing" + +# user available steps +CORRECT_NAMES = "correct_names" +REORDER_SSP_CO2 = "reorder_ssp_co2" +CREATE_FIRE_FILES = "create_fire_files" +CORRECT_UNITS = "correct_units" +CORRECT_CALENDAR = "correct_calendar" +CORRECT_TIME_AXIS = "correct_time_axis" +SUM_LEVELS = "sum_levels" +SUM_SECTORS = "sum_sectors" +CREATE_TOTALS = "create_totals" +MERGE_GHG = "merge_ghg" + +AVAILABLE_INPUT4MIPS_PROCESSING_STEPS = frozenset( + [ + CORRECT_NAMES, + REORDER_SSP_CO2, + CREATE_FIRE_FILES, + CORRECT_UNITS, + CORRECT_CALENDAR, + CORRECT_TIME_AXIS, + SUM_LEVELS, + SUM_SECTORS, + CREATE_TOTALS, + MERGE_GHG, + ] +) +ORDER = "order" +PROCESSOR_STEP = "processor_step" + +LOGGER = create_logger(__name__) + + +class Input4MipsEmissionProcessor(AbstractRawProcessor): + """ + Can be called to summarize emission over sectors. + + Only applied to Input4MIPs data. + """ + + def __init__( + self, + input_directory: Union[str, Path], + working_directory: Union[str, Path] = PROCESSED_DATA / "input4mips", + checker: AbstractDirectoryChecker = None, + processing_steps=None, + spat_load: str = "map", + temp_load: str = "mon", + ghg_list: list = None, + meta_data: Path = None, + fire_cases: bool = True, + force_sum_sectors: list = None, + input_freq: str = "mon", + input_res: str = "50_km", + ): + """Init emission / ghg processer + Args: + verify_units (bool): Makes sure that all ghg conform to the same units. + Default false because slow. + verify_resolution (bool): Checks if the res in the file name is the same like in the file. + Default false because slow. + verify_lonlat (bool): Checks if the longitude, latitude values are as expected. + Default false because slow. + verify_num_of_files (bool): Checks how many files are in the final dir. + raises a warning if more or less than one. + verify_variables (bool): Checks if the variables are also the expected vars. + fix_co2 (bool): Fixes co2 data if necessary. Needed for ssp scenarios! + Default false, because we are providing raw data where it's already fixed. + correct_names (bool): renames all files that are expected + to be biomassburning files. Attention! apply only to raw data, + will mess things up if applied later! 
+ correct_units (bool): Default False, because it takes long. Corrects + the units according to meta data saved in the raw param file. + correct_calendar (bool): Makes sure that the right calendar is used. + correct_time_axis (bool): Makes sure that each file starts at the first + of each time-unit and shifts if not. + sum_levels (bool): Makes sure that all emissions are sumed over the + different levels and sectors. For AIR files, the emissions are + summed over 25 different height levels. For ANTHRO files, the + emissions are summed over 8 different sectors. Openburning and + biomassburning have only one level (sectors are diff in external + percentage files, see 'fire_cases'). + sum_sectors (bool): Makes sure that ghg are summarized over diff sectors. + Historical data is treated differently. Different versions + are stored in the output_dir for historical data, since different + climate models need different versions here. There is a param/config file + mapping the right climate models to the right sum_sector types. + create_totals (bool): For some tasks you might not want to use + emission files, but arrays of totals over a specific time frame. + With this you can creat either map or total data; monthly or yearly. + See spat_load and temp_load for that. + spat_load (str): Can be "map" or "tot". Map is a long / lat map. + Total is an array with the totals summed over the spatial axis. + temp_load (str): Can be "mon" or "year". Decides if the loaded data + is monthly or yearly averaged. Can be extended later on. + merge_ghg (list or bool): If True: + Simply merges all ghg that can be found in the sub_dir (e.g.). + Instead one can hand over a list, which means that those ghg + specifically will be merged into a new file. + ghg_list (list): The greenhouse gases (ghg) that should be merged in + merge_ghg. Should be the folder names used in the respective dir. + meta_data (Path): Default None. Only necessary when sum_sectors is + used and anthro fire files should be generated. + fire_cases (bool): If the different fire cases should be considered + during sum_sectors. + force_sum_sectors (list): Which GHG are forced to be summed (sum_sectors) even if not all + sectors (AIR, anthro, biomassburning/openburning) is available. + Default is ["CO2"], since CO2 is the only GHG that does not have + biomassburning/openburning files as the rest does. + input_freq (str): Default "mon". Only used for create_totals_subdir. + input_res (str): Default "50 km". Only used for create_totals_subdir. + create_fire_files (bool): Default false, because it only needs to be + created one time. 
+ """ + # init abstract class with checks + super().__init__( + input_directory=input_directory, + working_directory=working_directory, + checker=checker, + processing_steps=processing_steps, + ) + self.cdo_operators = [] + self.spat_load = spat_load + self.temp_load = temp_load + + # ghg list that should be merged + self.ghg_list = ghg_list + if not self.ghg_list: + self.ghg_list = ["BC_sum", "CH4_sum", "SO2_sum"] + self.meta_data = meta_data + self.fire_cases = fire_cases + self.force_sum_sectors = force_sum_sectors + if not self.force_sum_sectors: + self.force_sum_sectors = ["CO2"] + self.input_freq = input_freq + self.input_res = input_res + + self.available_steps = { + REORDER_SSP_CO2: {ORDER: 1, PROCESSOR_STEP: RenameBiomassBurningFilesStep()}, + CORRECT_NAMES: {ORDER: 2, PROCESSOR_STEP: ReorderSSPCO2DimensionsStep()}, + CREATE_FIRE_FILES: {ORDER: 3, PROCESSOR_STEP: CreateAnthroFireFilesStep()}, + CORRECT_CALENDAR: {ORDER: 4, PROCESSOR_STEP: self._cdo_add_correct_calendar}, + CORRECT_TIME_AXIS: {ORDER: 5, PROCESSOR_STEP: self._cdo_add_correct_time_axis}, + SUM_LEVELS: {ORDER: 6, PROCESSOR_STEP: self._cdo_add_sum_levels}, + _CDO_PROCESSING: {ORDER: 7, PROCESSOR_STEP: CdoEmissionsProcessingStep(cdo_operators=self.cdo_operators)}, + CORRECT_UNITS: {ORDER: 8, PROCESSOR_STEP: CorrectUnitsStep(desired_units=self.desired_units)}, + SUM_SECTORS: { + ORDER: 9, + PROCESSOR_STEP: SumSectors(working_dir=self.working_directory, meta_data=self.meta_data), + }, + CREATE_TOTALS: { + ORDER: 10, + PROCESSOR_STEP: CreateEmissionsTotals( + working_dir=self.working_directory, + spat=self.spat_load, + temp=self.temp_load, + input_res=self.input_res, + input_freq=self.input_freq, + ), + }, + MERGE_GHG: {ORDER: 11, PROCESSOR_STEP: MergeGHG()}, + } + + # TODO CO2 baseline - where?? + # CO2 baseline: do this outside, because it means that the data looks + # different for each climate model. best case you just store that somewhere + # in a config, store that well and apply it in the dataloader for each model separatetly? + # self.substract_co2_baseline = substract_co2_baseline + + def add_correct_names_step(self): + self.processing_steps.append(CORRECT_NAMES) + + def add_reorder_ssp_co2_dimensions_step(self): + self.processing_steps.append(REORDER_SSP_CO2) + + def add_create_fire_files(self): + self.processing_steps.append(CREATE_FIRE_FILES) + + def add_correct_units_step(self): + self.processing_steps.append(CORRECT_UNITS) + + def add_correct_calendar_step(self): + self.processing_steps.append(CORRECT_CALENDAR) + + def add_correct_time_axis_step(self): + self.processing_steps.append(CORRECT_TIME_AXIS) + + def add_sum_levels_step(self): + self.processing_steps.append(SUM_LEVELS) + + def add_sum_sectors_step(self): + self.processing_steps.append(SUM_SECTORS) + + def add_create_totals_step(self): + self.processing_steps.append(CREATE_TOTALS) + + def add_merge_ghg_step(self): + self.processing_steps.append(MERGE_GHG) + + def set_load_types(self, spat_load: str = "map", temp_load: str = "mon"): + """ + Setting types for the loading processor. Can be set to update the types, so the same processor instance can be + applied afterward. + + Args: + spat_load (str): Can be "map" or "tot". Map is a long / lat map. + Total is an array with the totals summed over the spatial axis. + temp_load (str): Can be "mon" or "year". Decides if the loaded data + is monthly or yearly averaged. Can be extended later on. 
+ """ + self.spat_load = spat_load + self.temp_load = temp_load + + def type_class_meta(self) -> str: + """Returns the name tag of the subclass.""" + return "input4mips" + + # TODO share this in process class with input4mips / cmip6 + def file_belongs_to_type(self, input_file: Path) -> bool: + """ + Check if the name input4mip(s) appears in the path name. + + Args: + input_file (Path): the file that should be checked for input4mips + Returns: + True if name indicates it's a cmip6 file, False if not. + """ + return bool(re.search("input4mip", str(input_file), re.IGNORECASE)) + + def process_directory( + self, + input_dir: Path, + cleaned_dir: Path, + processed_dir: Path, + load_dir: Path, + overwrite: bool = False, + silent: bool = False, + sum_sec_input_res: str = "50_km", + sum_sec_input_freq: str = "mon", + ): + """ + Applying all the stuff as decided in the init function. Most of the functions build upon each other, i.e. you + cannot apply store_totals if you haven't used merge_ghg before. To keep things modular the user can still decide + which ones to apply (e.g. because some things may have been applied earlier / are not necessary). Just be aware + that things break if you are not making sure that the order is followed. + + Args: + input_dir (Path): To which directory the processing should be applied. + cleaned_dir (Path): Where cleaned data should be stored. This is used + for the "preprocess" steps. + processed_dir (Path): Where processed data is stored. This is used + for sum_sectors. Simple preprocessing is directly applied on + raw data here. + load_dir (Path): Where data is stored that is strongly modified and + ready to be loaded. This is used for merge_ghg and store_totals. + overwrite (bool): If the data should be overwritten if it already + exists. Default False. + silent (bool): If this should be processed silently. + sum_sec_input_res: + sum_sec_input_freq: + """ + if any([process in self.processing_steps for process in [CORRECT_CALENDAR, CORRECT_TIME_AXIS, SUM_LEVELS]]): + self.processing_steps.append(_CDO_PROCESSING) + + process_list = [self.available_steps[step] for step in self.available_steps] + ordered_processes = sorted(process_list, key=lambda step: step[ORDER]) + + current_processing_directory = input_dir + for _, process in ordered_processes: + output_dir = process[PROCESSOR_STEP](current_processing_directory) + current_processing_directory = output_dir + + def _cdo_add_correct_calendar(self): + self.cdo_operators.append(f"{CDO_SET_CALENDAR},{self.calendar}") + + def _cdo_add_correct_time_axis(self): + # TODO add check for mon data only + self.cdo_operators.append(CDO_SET_DAY) + self.cdo_operators.append(CDO_SET_TIME) + + def _cdo_add_sum_levels(self): + self.cdo_operators.append(CDO_VERTICAL_SUM) + + def set_meta_data(self, meta_data: Path): + """Set meta data path if not done during initialization.""" + self.meta_data = meta_data + + +class RenameBiomassBurningFilesStep(AbstractProcessorStep): + def execute(self, input_directory): + """Renames files (biomassburning) if needed.""" + output_dir = rename_biomassburning_files_in_directory(input_directory) + + LOGGER.info(f"... Finished renaming biomassburning files in {input_directory}") + self.results_directory = output_dir + + +class ReorderSSPCO2DimensionsStep(AbstractProcessorStep): + def execute(self, input_directory): + output_dir = rename_biomassburning_files_in_directory(input_directory) + LOGGER.info(f"... 
Finished renaming biomassburning files in {input_directory}") + self.results_directory = output_dir + + +class CreateAnthroFireFilesStep(AbstractProcessorStep): + def __init__(self, metadata_directory, calendar, overwrite=False): + super().__init__() + self.metadata_directory = metadata_directory + self.overwrite = overwrite + + def execute(self, input_directory): + self.results_directory = create_anthro_fire_directory( + input_directory, meta_data=self.metadata_directory, overwrite=self.overwrite + ) + + +class CdoEmissionsProcessingStep(AbstractProcessorStep): + def __init__(self, cdo_operators, overwrite=False): + super().__init__() + self.cdo_operators = cdo_operators + self.overwrite = overwrite + + def execute(self, input_directory): + self.results_directory = cdo_process_directory( + input_directory=input_directory, cdo_operators=self.cdo_operators, overwrite=self.overwrite + ) + + +class CorrectUnitsStep(AbstractProcessorStep): + def __init__(self, desired_units): + super().__init__() + self.desired_units = desired_units + + def execute(self, input_directory): + """ + Correcting the ghg units (if possible). + + Attention, this module + is slow since it is not handled with cdo. Do only if necessary. + Args: + input_dir (Path): All files in that dir will be adapted + """ + output_dir = correct_units_all_files_in_directory(input_dir=input_directory, desired_units=self.desired_units) + self.results_directory = output_dir + + +class SumSectors(AbstractProcessorStep): + def __init__(self, working_dir, meta_data): + super().__init__() + self.meta_data = meta_data + self.working_dir = working_dir + + def execute(self, input_directory): + self.results_directory = sum_sectors_directory( + input_dir=input_directory, output_dir=self.working_dir, meta_data=self.meta_data + ) + + +class CreateEmissionsTotals(AbstractProcessorStep): + def __init__( + self, + working_dir, + spat: str = "map", + temp: str = "mon", + input_freq: str = "mon", + input_res: str = "50_km", + overwrite: bool = True, + ): + super().__init__() + self.working_dir = working_dir + self.spat = spat + self.temp = temp + self.input_freq = input_freq + self.input_res = input_res + self.overwrite = overwrite + + def execute(self, input_directory): + self.results_directory = create_emissions_totals( + input_dir=input_directory, + output_dir=self.working_dir, + spat=self.spat, + temp=self.temp, + input_freq=self.input_freq, + input_res=self.input_res, + overwrite=self.overwrite, + ) + + +class MergeGHG(AbstractProcessorStep): + def __init__(self, working_dir, ghg_list, ghg_sum_name, overwrite=False): + super().__init__() + self.working_dir = working_dir + self.ghg_list = ghg_list + self.ghg_sum_name = ghg_sum_name + self.overwrite = overwrite + + def execute(self, input_directory): + self.results_directory = merge_ghg_directory( + input_dir=input_directory, + output_dir=self.working_dir, + ghg_list=self.ghg_list, + ghg_sum_name=self.ghg_sum_name, + overwrite=self.overwrite, + ) diff --git a/climateset/processing/raw/input4mips/processing.py b/climateset/processing/raw/input4mips/processing.py new file mode 100644 index 0000000..b4975b1 --- /dev/null +++ b/climateset/processing/raw/input4mips/processing.py @@ -0,0 +1,901 @@ +import os +import re +import subprocess +from copy import copy +from pathlib import Path + +import pint +import xarray as xr +from tqdm import tqdm + +from climateset.processing.raw.utils import create_generic_output_path +from climateset.utils import create_logger + +LOGGER = create_logger(__name__) + 
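The processor-step classes defined above all share the contract from AbstractProcessorStep: execute(input_directory) does the work and records where its output landed, and get_results_directory() hands that location to the next step. The following is a minimal sketch of that intended chaining; PassThroughStep and run_pipeline are hypothetical names used only for illustration and are not part of this patch.

```python
from pathlib import Path

from climateset.processing.abstract_processor_step import AbstractProcessorStep


class PassThroughStep(AbstractProcessorStep):
    """Hypothetical step that leaves the files untouched and passes the directory on."""

    def execute(self, input_directory):
        # a real step would transform the files and point results_directory at its output
        self.results_directory = Path(input_directory)


def run_pipeline(steps, input_directory):
    """Run each step on the directory produced by the previous step (sketch only)."""
    current_directory = input_directory
    for step in steps:
        step.execute(current_directory)
        current_directory = step.get_results_directory()
    return current_directory
```

With that contract, a processor only needs to execute its selected steps in order and thread the resulting directories through, which is what Input4MipsEmissionProcessor.process_directory is aiming for.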
+CDO_SET_CALENDAR = "-setcalendar"
+CDO_SET_DAY = "-setday,1"
+CDO_SET_TIME = "-settime,00:00:00"
+CDO_VERTICAL_SUM = "-vertsum"
+
+
+def rename_biomassburning_file(input_file: Path):
+    """
+    Rename a biomassburning file in case it is in the naked version (from the downloader). Originally, such files
+    are just called 'GHG' instead of 'GHG_em_biomassburning'. Overwrites the file directly.
+
+    Attention: apply this only to the raw data! The Input4mips preprocessor
+    renames variables later to the pure / naked GHG version - do not rename
+    them again!
+    Args:
+        input_file (Path): File that should be renamed
+    """
+    # fetch the ghg
+    ghg = str(input_file.parents[0]).split("/")[-4]
+    # check if this is the naked case
+    if "_" not in ghg:
+        new_ghg = f"{ghg}_em_biomassburning"
+        new_path_name = Path(str(input_file).replace(ghg, new_ghg))
+        new_path_name.parent.mkdir(parents=True, exist_ok=True)
+        # use cdo to rename the var internally
+        commands = [
+            "cdo",
+            "-s",
+            "-L",
+            f"-chname,{ghg},{ghg}_em_biomassburning",
+            f"-setattribute,variable_id={ghg}_em_biomassburning",
+            input_file,
+            new_path_name,
+        ]
+        subprocess.call(commands)
+
+        # remove the old file
+        input_file.unlink()
+
+
+def rename_biomassburning_files_in_directory(input_dir):
+    LOGGER.info(f"Starting to rename biomassburning files in {input_dir}.")
+    total_files = len(list(input_dir.rglob("*.nc")))
+    for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files):
+        if len(files) > 0:
+            for file in files:
+                rename_biomassburning_file(Path(path) / file)
+    # remove empty subdirs
+    for p in input_dir.glob("**/*"):
+        if p.is_dir() and len(list(p.iterdir())) == 0:
+            os.removedirs(p)
+    return input_dir
+
+
+def reorder_ssp_co2_file_dimensions(input_file: Path):
+    """
+    Fix a single CO2 file by reordering its dimensions so that CDO can be applied to it afterwards.
+
+    Memory issues might arise (core dumped). Not sure why. Runs on Compute Canada though.
+
+    This function is essentially a wrapper around the following NCO command
+    (ncpdq is an NCO operator, not a CDO one):
+
+    ncpdq --ovr -a time,level,lon,lat input.nc input.nc
+
+    Args:
+        input_file (Path): The CO2 file that should be fixed (overwritten in place).
+    """
+    # ncpdq --ovr -a time,level,lon,lat input.nc output.nc
+    commands = ["ncpdq", "--ovr", "-a", "time,level,lon,lat", input_file, input_file]
+    subprocess.call(commands)
+
+
+def reorder_all_ssp_co2_files_dimensions(input_dir: Path):
+    LOGGER.info(f"Starting to fix co2 files in {input_dir}.")
+    total_files = len(list(input_dir.rglob("*.nc")))
+    for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files):
+        if len(files) > 0:
+            for file in files:
+                if ("ssp" in file) and ("CO2" in file):
+                    file_path = Path(path) / file
+                    reorder_ssp_co2_file_dimensions(file_path)
+
+    LOGGER.info("...Finished fixing CO2 files.")
+    return input_dir
+
+
+def create_anthro_fire_file(
+    input_file: Path, metadata_directory: Path, filetype: str, cdo_operators, output_file: Path, threads: int = 1
+):
+    """
+    Create a new anthropogenic openburning or biomassburning file from a given emission file.
+
+    The new file is written to output_file.
+    Args:
+        input_file (Path): The emission file the anthropogenic fire file is derived from
+        metadata_directory (Path): Directory containing the percentage / share meta data files
+        filetype (str): either 'biomassburning' or 'openburning'
+        cdo_operators (list): Additional cdo operators (e.g. calendar and time-axis corrections) applied on the way
+        output_file (Path): Where to store the anthro fire file
+        threads (int): threads for cdo
+    """
+    if metadata_directory is None:
+        LOGGER.info("You need to provide meta data for this function. 
Use set_meta_data func for that.") + file_parts = str(input_file.stem).split("_") + scenario, ghg, year = file_parts[1], file_parts[2], file_parts[-1] + meta_dir = metadata_directory / filetype + + # prepare additional operators (calendar and time-axis) for later compatability + chained_operators = copy(cdo_operators) + + if filetype == "openburning": + # find a file that contains openburning, scenario and ghg in the meta_data / "future-openburning" + meta_dir = metadata_directory / "future-openburning" + matched_files = [ + f for f in meta_dir.rglob("*.nc") if (scenario in str(f)) and (filetype in str(f)) and (ghg in str(f)) + ] + # take first file that matches that + raw_percentage_file = matched_files[0] + tmp_perc_file = input_file.parents[0] / f"tmp_percentage_{year}.nc" + + # 1. drop the years you don't need from the meta file. -selyear,2015 + # 2. drop the levels you dont need - sellevel,0,1 + # 3. sum the remaining levels / sectors -vertsum + # -> cdo -vertsum -sellevel,0,1 -selyear,year meta_input.nc 2015_share_output.nc + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + CDO_VERTICAL_SUM, + "-sellevel,0,1", # 0 is agriculture, 1 is deforestation + f"-selyear,{year}", + raw_percentage_file, + tmp_perc_file, + ] + subprocess.call(commands) + + # multiply -> cdo mul fire_file.nc percentage_file new_anthro_fire_file.nc + # [IMPORTANT first file must be input-file -> gets meta data from first input file!] + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + *chained_operators, + "-mul", + input_file, + tmp_perc_file, + output_file, + ] + subprocess.call(commands) + + # remove 2015_share_output file to save space (unlink) + tmp_perc_file.unlink() + + elif filetype == "biomassburning": + meta_dir = metadata_directory / "historic-biomassburning" + matched_files = [ + f for f in meta_dir.rglob("*.nc") if (filetype in str(f)) and (ghg in str(f)) and (year in str(f)) + ] + + # find the agri / defo files (written this way for later extension) + perc_dict = { + "AGRI": None, + "DEFO": None, + } + for f in matched_files: + for key in perc_dict: + if key in str(f): + perc_dict[key] = f + if (perc_dict["AGRI"] is None) or (perc_dict["DEFO"] is None): + LOGGER.warning("The AGRI or DEFO file does not exist, see dictionary: \n", stacklevel=2) + LOGGER.info(perc_dict) + LOGGER.info("... skipping this file.") + return + + # cdo 'expr,percentage_AGRI=percentage_AGRI/100' add percentage_AGRI_year.nc + # percentage_DEFO_year.nc percentage_ANTHRO_year.nc + tmp_anthro_perc_file = input_file.parents[0] / f"tmp_percentage_anthro_{year}.nc" + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + "expr,percentage_AGRI=percentage_AGRI/100;", + "-add", + perc_dict["AGRI"], + perc_dict["DEFO"], + tmp_anthro_perc_file, + ] + subprocess.call(commands) + + # cdo mul input_file.nc percentage_ANTHRO_year.nc output.nc + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + *chained_operators, + "-mul", + input_file, + tmp_anthro_perc_file, + output_file, + ] + subprocess.call(commands) + + # delete temp files + tmp_anthro_perc_file.unlink() + + # BIOMASSBURNING + # TODO get the right biomassburning files: year, GHG, biomassburning and AGRI + DEFO + # 1. cdo 'expr,percentage_AGRI=percentage_AGRI/100' -selyear,year meta_input.nc percentage_AGRI_year.nc + # 2. cdo 'expr,percentage_DEFO=percentage_DEFO/100' -selyear,year meta_input.nc percentage_DEFO_year.nc + # 3. 
cdo add percentage_AGRI_year.nc percentage_DEFO_year.nc percentage_ANTHRO_year.nc
+        # 4. cdo mul input_file.nc percentage_ANTHRO_year.nc output.nc
+        # 5. remove old files: all the percentage files
+    else:
+        raise ValueError("Type must be either 'openburning' or 'biomassburning'.")
+
+
+# TODO fix output directory
+def create_anthro_fire_directory(
+    input_directory: Path, meta_data: Path, cdo_operators, overwrite: bool = False
+) -> Path:
+    """
+    Creating anthropogenic fire files for a subdir.
+
+    Args:
+        input_directory (Path): Input dir containing the biomassburning / openburning emission files
+        meta_data (Path): Meta data dir; the results are written to meta_data / "anthro-fire-data"
+        cdo_operators (list): Additional cdo operators passed on to create_anthro_fire_file
+        overwrite (bool): If existing output files should be overwritten. Default False.
+    Returns:
+        Path: The (unchanged) input directory.
+    """
+    # loop through input dir, get biomassburning and openburning files
+    LOGGER.info(f"Starting to create anthropogenic fire files for files in {input_directory}.")
+    fire_output_dir = meta_data / "anthro-fire-data"
+    total_files = len(list(input_directory.rglob("*.nc")))
+    for path, subdirs, files in tqdm(os.walk(input_directory), total=total_files):
+        if len(files) > 0:
+            for file in files:
+                input_file = Path(path) / file
+                # must match the right nominal and temporal resolution
+                if ("biomassburning" in file) and ("25_km" in file) and ("mon" in file):
+                    filetype = "biomassburning"
+                elif ("openburning" in file) and ("50_km" in file) and ("mon" in file):
+                    filetype = "openburning"
+                else:
+                    continue
+                scenario = file.split("_")[1]
+                ghg = file.split("_")[2]
+                fire_name = file.replace("_em_", "_anthro_")
+                output_file = Path(fire_output_dir / filetype / scenario / ghg / fire_name)
+                output_file.parent.mkdir(parents=True, exist_ok=True)
+                if (not output_file.is_file()) or overwrite:
+                    create_anthro_fire_file(
+                        input_file=input_file,
+                        metadata_directory=meta_data,
+                        filetype=filetype,
+                        cdo_operators=cdo_operators,
+                        output_file=output_file,
+                    )
+
+    LOGGER.info(f"... Finished creating anthro fire files and stored them in {fire_output_dir}")
+    return input_directory
+
+
+def cdo_process_file(input_file: Path, output_file: Path, threads: int = 1, cdo_operators=None):
+    """Applies the given cdo operators to a single netcdf file - i.e. only
+    those emission processing steps that can operate on one file at a time.
+    Args:
+        input_file (Path): The file that should be processed.
+        output_file (Path): Where the resulting file is stored.
+        threads (int): threads for cdo
+        cdo_operators (list): The chained cdo operators that should be applied.
+    """
+    # RENAME variables in case of biomassburning
+    # needs to be done beforehand because it otherwise messes things up;
+    # is done immediately in place
+    if "em_biomassburning" in str(input_file):
+        ds = xr.load_dataset(input_file)
+        var = ds.variable_id
+        old_name = str(input_file)
+        renamed_file = Path(old_name.replace(".nc", "_renamed.nc"))
+        if "em_biomassburning" not in var:
+            commands = [
+                "cdo",
+                "-s",
+                "-L",
+                f"-chname,{var},{var}_em_biomassburning",
+                f"-setattribute,variable_id={var}_em_biomassburning",
+                input_file,
+                renamed_file,
+            ]
+            subprocess.call(commands)
+            # rename files & reassign the input_file
+            input_file.unlink()
+            input_file = renamed_file.rename(old_name)
+
+    # guard against the default None so the operator list can always be modified below
+    chained_operators = copy(cdo_operators) if cdo_operators is not None else []
+    # cdo vertsum infile outfile [CHECKED]
+    # AIR: sum over height levels
+    # anthro: sum over different sectors
+
+    # ATTENTION only relevant for "monthly" data!
+    # cdo -setday,1 -settime,00:00:00
+    # the time axis should only be set for monthly data, so drop these operators
+    # again if the file name does not indicate monthly frequency
+    if "mon" not in str(input_file) and (CDO_SET_DAY in chained_operators or CDO_SET_TIME in chained_operators):
+        if CDO_SET_DAY in chained_operators:
+            chained_operators.remove(CDO_SET_DAY)
+        if CDO_SET_TIME in chained_operators:
+            chained_operators.remove(CDO_SET_TIME)
+        LOGGER.warning("Tried to set the time axis on non-monthly file; removing related cdo operators for file.")
+
+    # run commands
+    commands = ["cdo", "-s", "-w", "-L", "-P", str(threads), *chained_operators, input_file, output_file]
+    subprocess.call(commands)
+
+    # unit_key_list = [
+    #     "units", "unit",
+    #     "variable_units", "variable_unit",
+    #     "var_units", "var_unit"
+    # ]
+
+    # for k in unit_key_list:
+    #     try:
+    #         # TODO try to get the units
+    #         pass
+    #         # if successful: break loop
+    #     except AttributeError:
+    #         # TODO continue
+    #         continue
+    # if unit is None:
+    #     raise KeyError("Unit could not be found in file {}".format(input_file))
+    # else:
+    #     # TODO check if the unit is the one saved in the json file
+    #     calendar = self.meta_raw_dict["calendar"][var]
+    #     pass
+    # simply raises an error in case we encounter units that we don't expect
+
+
+# TODO fix output directory (the None default is not usable yet; pass output_directory explicitly)
+def cdo_process_directory(input_directory, cdo_operators, output_directory=None, overwrite=False):
+    """
+    Apply the desired emission processing steps to a directory and store the results in output_directory.
+
+    Can only be applied to raw input4mips data, not to data
+    that has already been processed (i.e. merged / summed up).
+    Args:
+        input_directory (Path): Where the preprocessing should be applied
+        cdo_operators (list): The chained cdo operators applied to every file
+        output_directory (Path): Where the results should be stored
+        overwrite (bool): If the data should be overwritten if it already exists
+    """
+    # TODO if a year / file is missing -> write a warning!
+    # TODO LATER add feature to check lon / lat consistency for input4mips
+    # 1. apply the "preprocessing" - all the functions that can
+    # be applied to a single file
+    LOGGER.info(f"Start preprocessing of emission files {input_directory}....")
+    total_files = len(list(input_directory.rglob("*.nc")))
+    for path, subdirs, files in tqdm(os.walk(input_directory), total=total_files):
+        if len(files) > 0:
+            for file in files:
+                # create output dir
+                input_file = Path(path) / file
+                # TODO fix filetype check
+                # if self.file_belongs_to_type(input_file):
+                output_file = create_generic_output_path(output_directory, path, file)
+                # skip if the file already exists and we don't want to overwrite it
+                if (not output_file.is_file()) or overwrite:
+                    LOGGER.info(f"\nProcessing the following file: {input_file}")
+                    cdo_process_file(input_file, output_file, cdo_operators=cdo_operators)
+
+    LOGGER.info(f"...Finished preprocessing \nof {input_directory} and saved it at {output_directory}.")
+    return output_directory
+
+
+def merge_ghg_files(input_files: list, output_file: Path, threads: int = 1):
+    """
+    Merging GHG emission files together.
+
+    Can only be applied after 'create_totals_subdir'!
+    Args:
+        input_files (list): List of files (Path) that should be merged.
+        output_file (Path): Where the merged data should be stored
+        threads (int): How many threads should be used.
+    """
+    if output_file.is_file():
+        output_file.unlink()
+
+    commands = ["cdo", "-w", "-s", "-L", "-P", str(threads), "-merge", *input_files, output_file]
+
+    subprocess.call(commands)
+
+
+def create_totals(input_files: list, output_path: Path, spat: str, temp: str, threads: int = 1):
+    """
+    Creating totals for a given list of files for different spatial and temporal settings.
+
+    Args:
+        input_files (list): List of paths that are used to create totals
+        output_path (Path): Where the merged totals should be stored.
+ spat (str): Can be "map" or "total". Map stores ghg emission maps + (longitude / latitude). Total sums the values over the spatial axes. + temp (str): Can be "mon" or "year". In case of year the data is summed + over years. Assumes that monthly data is used! + threads (int): number of threads + """ + # remove the file if it already exists + if output_path.is_file(): + output_path.unlink() + + # first basic command part + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + ] + + # add depending on cases + if spat == "total": + # commands.append("-fldsum") + # ATTTENTION this assumes that sum sectors has been applied before!! + ghg = str(input_files[0]).split("/")[-1].split("_")[2] + commands.append(f"expr,sum=fldsum({ghg});min=fldmin({ghg});max=fldmax({ghg});mean=fldmean({ghg});") + + if temp == "year": + commands.append("-yearsum") + + # add relevant commands for all cases + commands.extend(["-mergetime", *input_files, output_path]) + + subprocess.call(commands) + + +def merge_ghg_directory( + input_dir: Path, output_dir: Path, ghg_list: list, ghg_sum_name: str = "ALL", overwrite: bool = False +): + """ + Merging GHG emission files together. + + Can only be applied after + 'create_totals_subdir'! + Args: + input_dir (Path): Data dir that should be processed + output_dir (Path): Where the merged data should be stored + ghg_sum_name (str): How the dir and files where the merged data is + stored should be tagged. + overwrite (bool): If files should be overwritten when they already exist + """ + LOGGER.info(f"Start creating merged GHG emission files for {ghg_list} in {input_dir}...") + # create a dict with the ghg files that should be merged + file_dict = {} + # total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir)): + if len(files) > 0: + for file in files: + full_path = str(Path(path) / file) + # do this only for the ghg listed above: + for ghg in ghg_list: + if ghg in full_path: + ghg_file_str = full_path.replace(ghg, ghg_sum_name) + # add ghg group as key if they don't exist yet + if ghg_file_str not in file_dict: + file_dict[ghg_file_str] = [ghg] + else: + file_dict[ghg_file_str].append(ghg) + + # create the needed dirs + for file, ghgs in file_dict.items(): + # create output path + second_path_part = file.split("input4mips/")[-1] + out_path = Path(output_dir / "input4mips" / second_path_part) + out_path.parent.mkdir(parents=True, exist_ok=True) + + # create input files + input_files = [file.replace(ghg_sum_name, ghg) for ghg in ghgs] + + if (not out_path.is_file()) or overwrite: + merge_ghg_files( + input_files, + out_path, + ) + + LOGGER.info(f"... Finished merging GHG {ghg_list} in {output_dir}.") + return output_dir + + +def sum_sectors(input_files: list, sectors: list, output_path: Path, overwrite: bool = True, threads: str = 1): + """ + Args: + input_files (list): List of paths. The paths contain different + sectors and the information is summed up and stored in the + desired output_file. + sectors (list): List of strings. The different files that are stored + in input_files. The variable names of input_files. + output_path (Path): File where the resulting emissions are stored. + The output_path is adapted in case different fire cases are created. + overwrite (bool): True, i.e. if the file in output_path already + exists, it is overwritten. Set to False if you don't wanna overwrite + these files. + threads (str): Default 1. 
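+
+    Example (illustrative only; file names, paths and variable names below are hypothetical):
+        sum_sectors(
+            input_files=[Path("BC_em_anthro_2015.nc"), Path("BC_em_AIR_anthro_2015.nc")],
+            sectors=["BC_em_anthro", "BC_em_AIR_anthro"],
+            output_path=Path("BC_sum_2015.nc"),
+        )
+        # builds roughly: cdo 'expr,BC=BC_em_anthro+BC_em_AIR_anthro;' -merge <inputs> BC_sum_2015.nc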
+ + """ + # exit this if the file already exists and we dont wanna overwrite them + if output_path.is_file() and (not overwrite): + return + + ghg = sectors[0].split("_")[0] + # math sum expression + sum_expr = f"{ghg}={sectors[0]}" + for sec in sectors[1:]: + sum_expr = f"{sum_expr}+{sec}" + + # cdo expr,'var_new=var1+var2;' -merge infile1 infile2 outfile # works only on cmd + # cdo 'expr,var_new=var1+var2;' -merge infile1 infile2 outfile # works for both + commands = [ + "cdo", + "-w", + "-s", + "-L", + "-P", + str(threads), + f"expr,{sum_expr};", + "-merge", + *input_files, + output_path, + ] + subprocess.call(commands) + + +def correct_units_in_file(input_file: Path, desired_units: dict): + """ + Corrects units of a single file. + + Be aware this is slow if applied + to a large directory. Overwrites the old file + Args: + input_file (Path): The file whose units should be adapted. + """ + ureg = pint.UnitRegistry() + ds = xr.load_dataset(input_file) + + # only use the data variables (ignore bnds vars) + for var in ds.data_vars: + if "bnds" not in str(var): + # check the unit + found_unit = ds[var].units.replace("-", "^-") + desired_unit = desired_units[var].replace("-", "^-") + + # exit if it's the right one + if found_unit == desired_unit: + return + + try: + ureg(found_unit) + ureg(desired_unit) + except pint.errors.UndefinedUnitError: + LOGGER.info( + "Unit is not defined for pint. Check here: " + "https://github.com/hgrecco/pint/blob/master/pint/default_en.txt" + ) + return + # update ds + # this function works for any kind of unit transformations (if they can be transformed) + ds.update({var: xr.apply_ufunc(lambda x: ureg.Quantity(x, found_unit).to(desired_unit).magnitude, ds[var])}) + # update attrs + ds[var].attrs["units"] = desired_unit + # overwrite old file + ds.to_netcdf(input_file, mode="w", format="NETCDF4", engine="netcdf4") + + +def correct_units_all_files_in_directory(input_dir: Path, desired_units: dict): + LOGGER.info(f"Starting to correct all units (if needed) in {input_dir}.") + total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): + if len(files) > 0: + for file in files: + correct_units_in_file(Path(path) / file, desired_units) + + LOGGER.info(f"... Finished correcting the units (if needed) in {input_dir}") + return input_dir + + +def sum_sectors_fires(input_files: list, sectors: list, output_path: Path, meta_data: Path, overwrite: bool = True): + """ + Args: + input_files (list): List of paths. The paths contain different + sectors and the information is summed up and stored in the + desired output_file. + sectors (list): List of strings. The different files that are stored + in input_files. The variable names of input_files. + output_path (Path): File where the resulting emissions are stored. + The output_path is adapted in case different fire cases are created. + overwrite (bool): Default is True, i.e. data will be overwritten if it already exists + + """ + # check where (and if) biomassburning or openburning are contained + fire_idx = None + for i, var in enumerate(sectors): + if ("openburning" in var) or ("biomassburning" in var): + fire_idx = i + if fire_idx is None: + raise ValueError( + "Fire cases can only be created if openburning or biomassburning files are given. " + "(Must be contained in the file name as well)." 
+ ) + + # Get ghg and fire type + ghg = sectors[0].split("_")[0] + # fire_type = sectors[fire_idx].split("_")[-1] + + # CASE 1: ALL FIRES + all_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_all-fires_")) + all_fire_files = input_files + all_fire_sectors = sectors + sum_sectors(all_fire_files, all_fire_sectors, all_fire_output_path, overwrite=overwrite) + + # CASE 2: ANTHRO FIRES + anthro_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_anthro-fires_")) + anthro_fire_sectors = sectors + # get the right anthro fire file from meta + # 1. get the current fire file + all_fire_file = input_files[fire_idx] + anthro_file_name = str(all_fire_file.name).replace("_em_", "_anthro_") + potential_anthro_files = [f for f in meta_data.rglob(anthro_file_name)] + if len(potential_anthro_files) < 1: + LOGGER.info(f"No anthro fire file found for {all_fire_file.name}.") + pass + else: + if len(potential_anthro_files) > 1: + LOGGER.info(f"Several fire files found for {all_fire_file.name}. Choosing only the first one.") + anthro_fire_files = input_files.copy() + anthro_fire_files[fire_idx] = potential_anthro_files[0] + sum_sectors(anthro_fire_files, anthro_fire_sectors, anthro_fire_output_path, overwrite=overwrite) + + # CASE 3: NO FIRES + # drop the openburning or biomassburning sectors and files + no_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_no-fires_")) + no_fire_files = input_files.copy() + no_fire_sectors = sectors.copy() + no_fire_files.pop(fire_idx) + no_fire_sectors.pop(fire_idx) + sum_sectors(no_fire_files, no_fire_sectors, no_fire_output_path, overwrite=overwrite) + + +def sum_sector_cases( + file_dict: dict, + output_dir: Path, + meta_data: Path, + fire_cases: bool, + force_sum_sectors: list = None, + overwrite: bool = False, + silent: bool = False, +): + """ + Calls sum sector for different cases. + + Args: + file_dict (dict): "example_path_SECTOR_sth": [file_path, file_path, file_path] + output_dir (Path): output directory, where to save the diff cases. + fire_cases (bool): if the diff fire cases should be considered + force_sum_sectors (str): Default is empty, i.e. no merge is forced. + Put the name of GHG here that should be forced to merge. + overwrite (bool): if the files should be overwritten if they already exist + silent (bool): If this should be processed silently. 
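+        meta_data (Path): Directory holding the pre-computed anthropogenic fire files;
+            only needed when fire_cases is True (passed through to sum_sectors_fires).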
+ """ + if not force_sum_sectors: + force_sum_sectors = [] + + for file, sectors in tqdm(file_dict.items()): + num_nones = sum([sec is None for sec in sectors]) + + # create file lists and names for the case where everything is added + ghg = file.split("input4mips/")[-1].split("/")[1].split("_")[0] + input_files = [Path(file.replace("SECTOR", sec)) for sec in sectors if sec is not None] + sector_list = [f"{ghg}_{sec}" for sec in sectors if sec is not None] + out_path = Path(output_dir / "input4mips" / file.split("input4mips/")[-1].replace("SECTOR", "sum")) + out_path.parent.mkdir(parents=True, exist_ok=True) + + if num_nones == 0: + # normal case + if fire_cases: + sum_sectors_fires(input_files, sector_list, out_path, meta_data=meta_data, overwrite=overwrite) + else: + sum_sectors(input_files, sector_list, out_path, overwrite=overwrite) + + elif (num_nones == 1) and (ghg in force_sum_sectors): + # just the simple summing if desired + warning + if not silent: + LOGGER.warning(f"Merged files for {file}, however some sectors are missing", stacklevel=2) + sum_sectors(input_files, sector_list, out_path, overwrite=overwrite) + + elif (num_nones == 1) and ghg not in force_sum_sectors: + if not silent: + LOGGER.warning( + f"Skipping case {file}. Not all three sectors are available. " + f"You can force merging with 'force_sum_sectors=[GHG1, GHG2]'." + ) + + elif num_nones == 2: + # warning, skipping + if not silent: + LOGGER.warning( + f"Skipping case {file}. At least two files must exist to be summed together.", + stacklevel=2, + ) + else: + raise RuntimeError("'num_nones' must be between 0 and 3.") + + +# TODO fix output dir and overwrite +# TODO break this into two functions +def sum_sectors_directory( + input_dir: Path, + output_dir: Path, + meta_data: Path, + force_sum_sectors: list = None, + fire_cases: bool = False, + overwrite: bool = True, + silent: bool = False, + input_res: str = "50_km", + input_freq: str = "mon", +): + """ + Args: + force_sum_sectors (str): Default is empty, i.e. no merge is forced. + Put the name of GHG here that should be forced to merge. + fire_cases (bool): Default False means that simply all the data is + summed up and that's it. In case this is set to true, three different + things are calculated: anthro-fires, no-fires, all-fires. Can + only be done if "em_openburning" or "em_biomassburning" files + are included. + overwrite (bool): if the files should be overwritten if they already exist + silent (bool): If this should be processed silently. 
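+        input_dir (Path): Directory that is searched recursively for the emission files to sum.
+        output_dir (Path): Where the summed files are stored (under an "input4mips" subfolder).
+        meta_data (Path): Directory holding the pre-computed anthropogenic fire files;
+            only needed when fire_cases is True.
+        input_res (str): Nominal resolution a file path must contain to be processed. Default "50_km".
+        input_freq (str): Temporal frequency a file path must contain to be processed. Default "mon".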
+    """
+    LOGGER.info(f"Start summing up different types of emission files in {input_dir}...")
+    if not force_sum_sectors:
+        force_sum_sectors = []
+    # params of the files to sum over
+    # TODO make these params in a config
+    # TODO find out which files are skipped
+    # res = "50_km"
+    # freq = "mon"
+    # years = []
+    # dicts needed to match files together
+    # "example_path_SECTOR_sth": [None, None, None]
+    ssp_file_dict = {}
+    historical_file_dict = {}
+    sector_idx = {"em_AIR_anthro": 0, "em_anthro": 1, "em_openburning": 2, "em_biomassburning": 2}
+
+    # match all files that have the exact same name except for the sector part
+    total_files = len(list(input_dir.rglob("*.nc")))
+    for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files):
+        if len(files) > 0:
+            for file in files:
+                full_path = str(Path(path) / file)
+                # only process the files with the right res and frequency
+                if (input_res in full_path) and (input_freq in full_path):
+                    # sector descriptions
+                    # check if this is historical or ssp and match
+                    if "ssp" in file:
+                        sector = re.search(r"(em_AIR_anthro|em_anthro|em_openburning)", full_path).group(0)
+                        sector_file_str = full_path.replace(sector, "SECTOR")
+                        # add ghg group as key if they don't exist yet
+                        if sector_file_str not in ssp_file_dict:
+                            ssp_file_dict[sector_file_str] = [None, None, None]
+                        # add the found sector at the right sector index
+                        # ssp_file_dict[sector_file_str][sector_idx[sector]] = full_path
+                        ssp_file_dict[sector_file_str][sector_idx[sector]] = sector
+
+                    elif "historical" in file:
+                        sector = re.search(r"(em_AIR_anthro|em_anthro|em_biomassburning)", full_path).group(0)
+                        sector_file_str = full_path.replace(sector, "SECTOR")
+                        # add ghg group as key if they don't exist yet
+                        if sector_file_str not in historical_file_dict:
+                            historical_file_dict[sector_file_str] = [None, None, None]
+                        # add the sector at the right sector index
+                        # history_file_dict[sector_file_str][sector_idx[sector]] = full_path
+                        historical_file_dict[sector_file_str][sector_idx[sector]] = sector
+
+                    else:
+                        raise ValueError(
+                            "Input4mips files must include the scenario ('ssp' or 'historical') in their name."
+                        )
+
+    sum_sector_cases(
+        file_dict=ssp_file_dict,
+        output_dir=output_dir,
+        fire_cases=fire_cases,
+        meta_data=meta_data,
+        force_sum_sectors=force_sum_sectors,
+        overwrite=overwrite,
+        silent=silent,
+    )
+
+    sum_sector_cases(
+        file_dict=historical_file_dict,
+        output_dir=output_dir,
+        fire_cases=fire_cases,
+        meta_data=meta_data,
+        force_sum_sectors=force_sum_sectors,
+        overwrite=overwrite,
+        silent=silent,
+    )
+
+    LOGGER.info(f"...Finished summing up ghg emissions and stored results in {output_dir}")
+    return output_dir
+
+
+def create_emissions_totals(
+    input_dir: Path,
+    output_dir: Path,
+    spat: str = "map",
+    temp: str = "mon",
+    input_freq: str = "mon",
+    input_res: str = "50_km",
+    overwrite: bool = True,
+):
+    """
+    Creating the data that can actually be loaded by pytorch.
+
+    Args:
+        input_dir (Path): Data dir that should be processed
+        output_dir (Path): Where the merged data should be stored
+        spat (str): Can be "map" or "total". Map stores ghg emission maps
+            (longitude / latitude). Total sums the values over the spatial axes.
+        temp (str): Can be "mon" or "year". In case of year the data is summed
+            over years. Assumes that monthly data is used!
+        input_freq (str): The frequency of the input data (can only process
+            one type after another, so this must be fixed). Default "mon".
+ input_res (str): The nominal resolution of the input data (can only process) + one type after another, so this must be fixed. Default "50_km". + overwrite (bool): If files should be overwritten when they already exist + """ + LOGGER.info(f"Start creating loadable emission data {input_dir}....") + if spat not in ["map", "total"]: + raise ValueError("Parameter spat must be either 'map' or 'total'.") + if temp not in ["mon", "year"]: + raise ValueError("Parameter temp must be either 'mon' or 'year'.") + + # Match all the files that belong together and contain mon (only difference is the year ending!) + # TODO move this function into an external one + file_dict = {} + # total_files = len(list(input_dir.rglob("*.nc"))) + for path, subdirs, files in tqdm(os.walk(input_dir)): + if len(files) > 0: + for file in files: + full_path = str(Path(path) / file) + if (input_res in full_path) and (input_freq in full_path): + year = int(file.split(".")[0].split("_")[-1]) + year_file_str = full_path.replace(str(year), "YEAR") + # add ghg group as key if they don't exist yet + if year_file_str not in file_dict: + file_dict[year_file_str] = [year] + else: + file_dict[year_file_str].append(year) + + # create the needed dirs + for file, years in file_dict.items(): + # create output path + second_path_part = file.split("input4mips/")[-1] + second_path_part = second_path_part.replace(input_res, f"{spat}_{input_res}") + second_path_part = second_path_part.replace(input_freq, f"{temp}") + second_path_part = second_path_part.replace("YEAR", f"{min(years)}-{max(years)}") + out_path = Path(output_dir / "input4mips" / second_path_part) + out_path.parent.mkdir(parents=True, exist_ok=True) + + # create input files + input_files = [file.replace("YEAR", str(y)) for y in years] + + if (not out_path.is_file()) or overwrite: + create_totals( + input_files, + out_path, + spat, + temp, + ) + + LOGGER.info(f"... 
Finished creating loadable emission data {output_dir}....") + return output_dir diff --git a/climateset/processing/raw/input4mips_processing.py b/climateset/processing/raw/input4mips_processing.py deleted file mode 100644 index 9a1159f..0000000 --- a/climateset/processing/raw/input4mips_processing.py +++ /dev/null @@ -1,1173 +0,0 @@ -import os -import re -import subprocess -import warnings -from copy import copy -from pathlib import Path -from typing import Union - -import pint -import xarray as xr -from tqdm import tqdm - -from climateset.processing.raw.abstract_raw_processing import AbstractRawProcesser -from climateset.processing.raw.checker import AbstractDirectoryChecker -from climateset.processing.raw.utils import create_generic_output_path -from climateset.utils import create_logger - -# -# Processing steps -# - -# private step -_CDO_PROCESSING = "_cdo_processing" - -# user available steps -CORRECT_NAMES = "correct_names" -FIX_CO2 = "fix_co2" -CREATE_FIRE_FILES = "create_fire_files" -CORRECT_UNITS = "correct_units" -CORRECT_CALENDAR = "correct_calendar" -CORRECT_TIME_AXIS = "correct_time_axis" -SUM_LEVELS = "sum_levels" -SUM_SECTORS = "sum_sectors" -CREATE_TOTALS = "create_totals" -MERGE_GHG = "merge_ghg" - -AVAILABLE_INPUT4MIPS_PROCESSING_STEPS = frozenset( - [ - CORRECT_NAMES, - FIX_CO2, - CREATE_FIRE_FILES, - CORRECT_UNITS, - CORRECT_CALENDAR, - CORRECT_TIME_AXIS, - SUM_LEVELS, - SUM_SECTORS, - CREATE_TOTALS, - MERGE_GHG, - ] -) - -SET_DAY = "-setday,1" -SET_TIME = "-settime,00:00:00" - -LOGGER = create_logger(__name__) - - -class Input4MipsEmissionProcesser(AbstractRawProcesser): - """ - Can be called to summarize emission over sectors. - - Only applied to Input4MIPs data. - """ - - def __init__( - self, - input_directory: Union[str, Path], - working_directory: Union[str, Path], - checker: AbstractDirectoryChecker = None, - processing_steps=None, - spat_load: str = "map", - temp_load: str = "mon", - ghg_list: list = None, - meta_data: Path = None, - fire_cases: bool = True, - force_sum_sectors: list = None, - input_freq: str = "mon", - input_res: str = "50_km", - ): - """Init emission / ghg processer - Args: - verify_units (bool): Makes sure that all ghg conform to the same units. - Default false because slow. - verify_resolution (bool): Checks if the res in the file name is the same like in the file. - Default false because slow. - verify_lonlat (bool): Checks if the longitude, latitude values are as expected. - Default false because slow. - verify_num_of_files (bool): Checks how many files are in the final dir. - raises a warning if more or less than one. - verify_variables (bool): Checks if the variables are also the expected vars. - fix_co2 (bool): Fixes co2 data if necessary. Needed for ssp scenarios! - Default false, because we are providing raw data where it's already fixed. - correct_names (bool): renames all files that are expected - to be biomassburning files. Attention! apply only to raw data, - will mess things up if applied later! - correct_units (bool): Default False, because it takes long. Corrects - the units according to meta data saved in the raw param file. - correct_calendar (bool): Makes sure that the right calendar is used. - correct_time_axis (bool): Makes sure that each file starts at the first - of each time-unit and shifts if not. - sum_levels (bool): Makes sure that all emissions are sumed over the - different levels and sectors. For AIR files, the emissions are - summed over 25 different height levels. 
For ANTHRO files, the - emissions are summed over 8 different sectors. Openburning and - biomassburning have only one level (sectors are diff in external - percentage files, see 'fire_cases'). - sum_sectors (bool): Makes sure that ghg are summarized over diff sectors. - Historical data is treated differently. Different versions - are stored in the output_dir for historical data, since different - climate models need different versions here. There is a param/config file - mapping the right climate models to the right sum_sector types. - create_totals (bool): For some tasks you might not want to use - emission files, but arrays of totals over a specific time frame. - With this you can creat either map or total data; monthly or yearly. - See spat_load and temp_load for that. - spat_load (str): Can be "map" or "tot". Map is a long / lat map. - Total is an array with the totals summed over the spatial axis. - temp_load (str): Can be "mon" or "year". Decides if the loaded data - is monthly or yearly averaged. Can be extended later on. - merge_ghg (list or bool): If True: - Simply merges all ghg that can be found in the sub_dir (e.g.). - Instead one can hand over a list, which means that those ghg - specifically will be merged into a new file. - ghg_list (list): The greenhouse gases (ghg) that should be merged in - merge_ghg. Should be the folder names used in the respective dir. - meta_data (Path): Default None. Only necessary when sum_sectors is - used and anthro fire files should be generated. - fire_cases (bool): If the different fire cases should be considered - during sum_sectors. - force_sum_sectors (list): Which GHG are forced to be summed (sum_sectors) even if not all - sectors (AIR, anthro, biomassburning/openburning) is available. - Default is ["CO2"], since CO2 is the only GHG that does not have - biomassburning/openburning files as the rest does. - input_freq (str): Default "mon". Only used for create_totals_subdir. - input_res (str): Default "50 km". Only used for create_totals_subdir. - create_fire_files (bool): Default false, because it only needs to be - created one time. 
- """ - # init abstract class with checks - super().__init__( - input_directory=input_directory, - working_directory=working_directory, - checker=checker, - processing_steps=processing_steps, - ) - - self.available_steps = { - CORRECT_NAMES: {"order": 1, "function": self.correct_names_subdir}, - FIX_CO2: {"order": 2, "function": self.fix_co2_subdir}, - CREATE_FIRE_FILES: {"order": 3, "function": self.create_anthro_fire_subdir}, - CORRECT_UNITS: {"order": 4, "function": self.correct_units_subdir}, - CORRECT_CALENDAR: {"order": 5, "function": self._correct_calendar}, - CORRECT_TIME_AXIS: {"order": 6, "function": self._correct_time_axis}, - SUM_LEVELS: {"order": 7, "function": self._sum_levels}, - _CDO_PROCESSING: {"order": 8, "function": self.cdo_preprocess_subdir}, - SUM_SECTORS: {"order": 9, "function": self.sum_sectors_subdir}, - CREATE_TOTALS: {"order": 10, "function": self.create_totals_subdir}, - MERGE_GHG: {"order": 11, "function": self.merge_ghg_subdir}, - } - - self.cdo_operators = [] - self.spat_load = spat_load - self.temp_load = temp_load - - # ghg list that should be merged - self.ghg_list = ghg_list - if not self.ghg_list: - self.ghg_list = ["BC_sum", "CH4_sum", "SO2_sum"] - self.meta_data = meta_data - self.fire_cases = fire_cases - self.force_sum_sectors = force_sum_sectors - if not self.force_sum_sectors: - self.force_sum_sectors = ["CO2"] - self.input_freq = input_freq - self.input_res = input_res - - # TODO CO2 baseline - where?? - # CO2 baseline: do this outside, because it means that the data looks - # different for each climate model. best case you just store that somewhere - # in a config, store that well and apply it in the dataloader for each model separatetly? - # self.substract_co2_baseline = substract_co2_baseline - - def add_correct_names_step(self): - self.processing_steps.append(CORRECT_NAMES) - - def add_fix_co2_step(self): - self.processing_steps.append(FIX_CO2) - - def add_create_fire_files(self): - self.processing_steps.append(CREATE_FIRE_FILES) - - def add_correct_units_step(self): - self.processing_steps.append(CORRECT_UNITS) - - def add_correct_calendar_step(self): - self.processing_steps.append(CORRECT_CALENDAR) - - def add_correct_time_axis_step(self): - self.processing_steps.append(CORRECT_TIME_AXIS) - - def add_sum_levels_step(self): - self.processing_steps.append(SUM_LEVELS) - - def add_sum_sectors_step(self): - self.processing_steps.append(SUM_SECTORS) - - def add_create_totals_step(self): - self.processing_steps.append(CREATE_TOTALS) - - def add_merge_ghg_step(self): - self.processing_steps.append(MERGE_GHG) - - def set_load_types(self, spat_load: str = "map", temp_load: str = "mon"): - """ - Setting types for the loading processer. Can be set to update the types, so the same processer instance can be - applied afterwards. - - Args: - spat_load (str): Can be "map" or "tot". Map is a long / lat map. - Total is an array with the totals summed over the spatial axis. - temp_load (str): Can be "mon" or "year". Decides if the loaded data - is monthly or yearly averaged. Can be extended later on. - """ - self.spat_load = spat_load - self.temp_load = temp_load - - def type_class_meta(self) -> str: - """Returns the name tag of the subclass.""" - return "input4mips" - - # TODO share this in process class with input4mips / cmip6 - def file_belongs_to_type(self, input_file: Path) -> bool: - """ - Check if the name input4mip(s) appears in the path name. 
- - Args: - input_file (Path): the file that should be checked for input4mips - Returns: - True if name indicates it's a cmip6 file, False if not. - """ - return bool(re.search("input4mip", str(input_file), re.IGNORECASE)) - - def preprocess_subdir( - self, - input_dir: Path, - cleaned_dir: Path, - processed_dir: Path, - load_dir: Path, - overwrite: bool = False, - silent: bool = False, - sum_sec_input_res: str = "50_km", - sum_sec_input_freq: str = "mon", - ): - """ - Applying all the stuff as decided in the init function. Most of the functions build upon each other, i.e. you - cannot apply store_totals if you haven't used merge_ghg before. To keep things modular the user can still decide - which ones to apply (e.g. because some things may have been applied earlier / are not necessary). Just be aware - that things break if you are not making sure that the order is followed. - - Args: - input_dir (Path): To which directory the processing should be applied. - cleaned_dir (Path): Where cleaned data should be stored. This is used - for the "preprocess" steps. - processed_dir (Path): Where processed data is stored. This is used - for sum_sectors. Simple preprocessing is directly applied on - raw data here. - load_dir (Path): Where data is stored that is strongly modified and - ready to be loaded. This is used for merge_ghg and store_totals. - overwrite (bool): If the data should be overwritten if it already - exists. Default False. - silent (bool): If this should be processed silently. - sum_sec_input_res: - sum_sec_input_freq: - """ - if any([process in self.processing_steps for process in [CORRECT_CALENDAR, CORRECT_TIME_AXIS, SUM_LEVELS]]): - self.processing_steps.append(_CDO_PROCESSING) - - process_list = [self.available_steps[step] for step in self.available_steps] - ordered_processes = sorted(process_list, key=lambda step: step["order"]) - - for _, process in ordered_processes: - # process["function"]() - print(process) - - def fix_co2_subdir(self, input_dir: Path): - """ - Fix CO2 files. - - Args: - input_dir (Path): All ssp CO2 files in that dir will be adapted - """ - print(f"Starting to fix co2 files in {input_dir}.") - total_files = len(list(input_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): - if len(files) > 0: - for file in files: - if ("ssp" in file) and ("CO2" in file): - self._fix_co2(Path(path) / file) - - print("...Finished fixing CO2 files.") - - def _fix_co2(self, input_file: Path): - """ - Fix single co2 file. - - CDO can be applied again afterwards. - Memory issues might arise (core dumped). Not sure why. Runs on compute canada though. - Args: - input_file (Path): The co2 file that should be fixed. - """ - # ncpdq --ovr -a time,level,lon,lat input.nc output.nc - commands = ["ncpdq", "--ovr", "-a", "time,level,lon,lat", input_file, input_file] - subprocess.call(commands) - - def correct_names_subdir(self, input_dir: Path): - """ - Renames files (biomassburning) if needed. - - Args: - input_dir (Path): All files in that dir will be adapted - """ - print(f"Starting to rename biomassburning files in {input_dir}.") - total_files = len(list(input_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): - if len(files) > 0: - for file in files: - self._rename_biomassburning_files(Path(path) / file) - - # remove empty subdirs - for p in input_dir.glob("**/*"): - if p.is_dir() and len(list(p.iterdir())) == 0: - os.removedirs(p) - - print(f"... 
Finished renaming biomassburning files in {input_dir}") - - # TODO test - def correct_units_subdir(self, input_dir: Path): - """ - Correcting the ghg units (if possible). - - Attention, this module - is slow since it is not handled with cdo. Do only if necessary. - Args: - input_dir (Path): All files in that dir will be adapted - """ - print(f"Starting to correct all units (if needed) in {input_dir}.") - total_files = len(list(input_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): - if len(files) > 0: - for file in files: - self._correct_units(Path(path) / file) - - print(f"... Finished correcting the units (if needed) in {input_dir}") - - def _rename_biomassburning_files(self, input_file: Path): - """ - Rename file for biomassburning in case it is in the naked version (from the downloader). Originally, the files - are just called 'GHG' instead of GHG_em_biomassburning. Overwrites the files directly. Attention, - - apply this only directly to the raw data!! The Input4mips preprocesser - renames variables later to the pure / naked GHG version - do not rename - them! - Args: - input_file (Path): File that should be renamed - """ - # fetch the ghg - ghg = str(input_file.parents[0]).split("/")[-4] - # check if this is the naked case - if "_" not in ghg: - new_ghg = f"{ghg}_em_biomassburning" - new_path_name = Path(str(input_file).replace(ghg, new_ghg)) - new_path_name.parent.mkdir(parents=True, exist_ok=True) - # use cdo to rename the var internally - commands = [ - "cdo", - "-s", - "-L", - f"-chname,{ghg},{ghg}_em_biomassburning", - f"-setattribute,variable_id={ghg}_em_biomassburning", - input_file, - new_path_name, - ] - subprocess.call(commands) - - # remove the old file - input_file.unlink() - - # TODO test - def _correct_units(self, input_file: Path): - """ - Corrects units of a single file. - - Be aware this is slow if applied - to a large directory. Overwrites the old file - Args: - input_file (Path): The file whose units should be adapted. - """ - ureg = pint.UnitRegistry() - ds = xr.load_dataset(input_file) - - # only use the data variables (ignore bnds vars) - for var in ds.data_vars: - if "bnds" not in str(var): - # check the unit - found_unit = ds[var].units.replace("-", "^-") - desired_unit = self.desired_units[var].replace("-", "^-") - - # exit if it's the right one - if found_unit == desired_unit: - return - - else: - try: - ureg(found_unit) - ureg(desired_unit) - except pint.errors.UndefinedUnitError: - print( - "Unit is not defined for pint. Check here: https://github.com/hgrecco/pint/blob/master/pint/default_en.txt" - ) - return - # update ds - # this function works for any kind of unit transformations (if they can be transformed) - ds.update( - { - var: xr.apply_ufunc( - lambda x: ureg.Quantity(x, found_unit).to(desired_unit).magnitude, ds[var] - ) - } - ) - # update attrs - ds[var].attrs["units"] = desired_unit - # overwrite old file - ds.to_netcdf(input_file, mode="w", format="NETCDF4", engine="netcdf4") - - def merge_ghg_subdir( - self, input_dir: Path, output_dir: Path, ghg_list: list, ghg_sum_name: str = "ALL", overwrite: bool = False - ): - """ - Merging GHG emission files together. - - Can only be applied after - 'create_totals_subdir'! - Args: - input_dir (Path): Data dir that should be processed - output_dir (Path): Where the merged data should be stored - ghg_sum_name (str): How the dir and files where the merged data is - stored should be tagged. 
- overwrite (bool): If files should be overwritten when they already exist - """ - print(f"Start creating merged GHG emission files for {ghg_list} in {input_dir}...") - # create a dict with the ghg files that should be merged - file_dict = {} - # total_files = len(list(input_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(input_dir)): - if len(files) > 0: - for file in files: - full_path = str(Path(path) / file) - # do this only for the ghg listed above: - for ghg in ghg_list: - if ghg in full_path: - ghg_file_str = full_path.replace(ghg, ghg_sum_name) - # add ghg group as key if they don't exist yet - if not (ghg_file_str in file_dict): - file_dict[ghg_file_str] = [ghg] - else: - file_dict[ghg_file_str].append(ghg) - - # create the needed dirs - for file, ghgs in file_dict.items(): - # create output path - second_path_part = file.split("input4mips/")[-1] - out_path = Path(output_dir / "input4mips" / second_path_part) - out_path.parent.mkdir(parents=True, exist_ok=True) - - # create input files - input_files = [file.replace(ghg_sum_name, ghg) for ghg in ghgs] - - if (not out_path.is_file()) or overwrite: - self._merge_ghg_files( - input_files, - out_path, - ) - - print(f"... Finished merging GHG {ghg_list} in {output_dir}.") - - def _merge_ghg_files(self, input_files: list, output_file: Path, threads: int = 1): - """ - Merging GHG emission files together. - - Can only be applied after - 'create_totals_subdir'! - Args: - input_files (list): List of files (Path) that should be merged. - output_file (Path): Where the merged data should be stored - threads (int): How many threads should be used. - """ - if output_file.is_file(): - output_file.unlink() - - commands = ["cdo", "-w", "-s", "-L", "-P", str(threads), "-merge", *input_files, output_file] - - subprocess.call(commands) - - def create_totals_subdir( - self, - input_dir: Path, - output_dir: Path, - spat: str = "map", - temp: str = "mon", - input_freq: str = "mon", - input_res: str = "50_km", - overwrite: bool = True, - ): - """ - Creating the data that can actually be loaded by pytorch. - - Args: - input_dir (Path): Data dir that should be processed - output_dir (Path): Where the merged data should be stored - spat (str): Can be "map" or "total". Map stores ghg emission maps - (longitude / latitude). Total sums the values over the spatial axes. - temp (str): Can be "mon" or "year". In case of year the data is summed - over years. Assumes that monthly data is used! - input_freq (str): The frequency of the input data (can only process) - one type after another, so this must be fixed. Default "mon". - input_res (str): The nominal resolution of the input data (can only process) - one type after another, so this must be fixed. Default "50_km". - overwrite (bool): If files should be overwritten when they already exist - """ - print(f"Start creating loadable emission data {input_dir}....") - if spat not in ["map", "total"]: - raise ValueError("Parameter spat must be either 'map' or 'total'.") - if temp not in ["mon", "year"]: - raise ValueError("Parameter temp must be either 'mon' or 'year'.") - - # Match all the files that belong together and contain mon (only difference is the year ending!) 
- # TODO move this function into an external one - file_dict = {} - # total_files = len(list(input_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(input_dir)): - if len(files) > 0: - for file in files: - full_path = str(Path(path) / file) - if (input_res in full_path) and (input_freq in full_path): - year = int(file.split(".")[0].split("_")[-1]) - year_file_str = full_path.replace(str(year), "YEAR") - # add ghg group as key if they don't exist yet - if year_file_str not in file_dict: - file_dict[year_file_str] = [year] - else: - file_dict[year_file_str].append(year) - - # create the needed dirs - for file, years in file_dict.items(): - # create output path - second_path_part = file.split("input4mips/")[-1] - second_path_part = second_path_part.replace(input_res, f"{spat}_{input_res}") - second_path_part = second_path_part.replace(input_freq, f"{temp}") - second_path_part = second_path_part.replace("YEAR", f"{min(years)}-{max(years)}") - out_path = Path(output_dir / "input4mips" / second_path_part) - out_path.parent.mkdir(parents=True, exist_ok=True) - - # create input files - input_files = [file.replace("YEAR", str(y)) for y in years] - - if (not out_path.is_file()) or overwrite: - self._create_totals( - input_files, - out_path, - spat, - temp, - ) - - print(f"... Finished creating loadable emission data {output_dir}....") - - def _create_totals(self, input_files: list, output_path: Path, spat: str, temp: str, threads: int = 1): - """ - Creating totals for a given list of files for different spatial and temporal settings. - - Args: - input_files (list): List of paths that are used to create totals - output_path (Path): Where the merged totals should be stored. - spat (str): Can be "map" or "total". Map stores ghg emission maps - (longitude / latitude). Total sums the values over the spatial axes. - temp (str): Can be "mon" or "year". In case of year the data is summed - over years. Assumes that monthly data is used! - threads (int): number of threads - """ - # remove the file if it already exists - if output_path.is_file(): - output_path.unlink() - - # first basic command part - commands = [ - "cdo", - "-w", - "-s", - "-L", - "-P", - str(threads), - ] - - # add depending on cases - if spat == "total": - # commands.append("-fldsum") - # ATTTENTION this assumes that sum sectors has been applied before!! - ghg = str(input_files[0]).split("/")[-1].split("_")[2] - commands.append("expr,sum=fldsum({0});min=fldmin({0});max=fldmax({0});mean=fldmean({0});".format(ghg)) - - if temp == "year": - commands.append("-yearsum") - - # add relevant commands for all cases - commands.extend(["-mergetime", *input_files, output_path]) - - subprocess.call(commands) - - def _correct_calendar(self): - self.cdo_operators.append(f"-setcalendar,{self.calendar}") - - def _correct_time_axis(self): - # TODO add check for mon data only - self.cdo_operators.append(SET_DAY) - self.cdo_operators.append(SET_TIME) - - def _sum_levels(self): - self.cdo_operators.append("-vertsum") - - # TODO if a year / file is missing -> write a warning! - # TODO LATER add feature to check lon / lat consistency for input4mips - def cdo_preprocess_subdir(self, sub_dir: Path, output_dir: Path, overwrite: bool = False): - """ - Apply desired emission processing steps to a subdir and store it in output_dir. - - Can only be applied on raw input4mpis data, not on those - that have already been processed (i.e. merged / summed up). 
- Args: - sub_dir (Path): Where the preprocessing should be applied - output_dir (Path): Where the results should be stored - overwrite (bool): If the data should be overwritten if it already exists - """ - # 1. apply the "preprocessing" - all the functions that can - # be applied to a single file - print(f"Start preprocessing of emission files {sub_dir}....") - total_files = len(list(sub_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(sub_dir), total=total_files): - if len(files) > 0: - for file in files: - # create output dir - input_file = Path(path) / file - if self.file_belongs_to_type(input_file): - output_file = create_generic_output_path(output_dir, path, file) - # skip if file already exists and we dont wanna overwrite it - if (not output_file.is_file()) or overwrite: - print(f"\nProcessing the following file: {input_file}") - self._cdo_preprocess(input_file, output_file) - - print(f"...Finished preprocessing \nof {sub_dir} and saved it at {output_dir}.") - - def _cdo_preprocess(self, input_file: Path, output_file: Path, threads: int = 1, cdo_operators=None): - """Applies all the emission processing tasks defined during init on - a given netcdf file - but only those ones that can operate on a single - file. - Args: - input_file (Path): The file that should be processed. - output_file (Path): Where the resulting file is stored. - threads (int): threads for cdo - """ - # RENAME variables in case of biomassburning - # needs to be done beforehand becauses messes too much stuff up - # is done immediately in place - if "em_biomassburning" in str(input_file): - ds = xr.load_dataset(input_file) - var = ds.variable_id - old_name = str(input_file) - renamed_file = Path(old_name.replace(".nc", "_renamed.nc")) - if "em_biomassburning" not in var: - commands = [ - "cdo", - "-s", - "-L", - f"-chname,{var},{var}_em_biomassburning", - f"-setattribute,variable_id={var}_em_biomassburning", - input_file, - renamed_file, - ] - subprocess.call(commands) - # rename files & reassign the input_file - input_file.unlink() - input_file = renamed_file.rename(old_name) - - chained_operators = copy(self.cdo_operators) - # cdo vertsum infile outfile [CHECKED] - # AIR: sum over height levels - # anthro: sum over different sectors - - # ATTENTION only relevant for "monthly" data! 
- # cdo -setday,1 -settime,00:00:00 - # TODO add check for mon data only - if "mon" not in "": - chained_operators.remove(SET_DAY) - chained_operators.remove(SET_TIME) - LOGGER.warning("Tried to set the time axis on non-monthly file; removing related cdo operators for file.") - - # run commands - commands = ["cdo", "-s", "-w", "-L", "-P", str(threads), *chained_operators, input_file, output_file] - subprocess.call(commands) - - # unit_key_list = [ - # "units", "unit", - # "variable_units", "variable_unit", - # "var_units", "var_unit" - # ] - - # for k in unit_key_list: - # try: - # # TODO try to get the units - # pass - # # if successful: break loop - # except AttributeError: - # # TODO continue - # continue - # if unit is None: - # raise KeyError("Unit could not be found in file {}".format(input_file)) - # else: - # # TODO check if the unit is the one saved in the jason file - # calendar = self.meta_raw_dict["calendar"][var] - # pass - # simply raises an error in case we encounter units that we dont expect - - # TODO break this into two functions - def sum_sectors_subdir( - self, - sub_dir: Path, - output_dir: Path, - force_sum_sectors: list = None, - fire_cases: bool = False, - overwrite: bool = True, - silent: bool = False, - input_res: str = "50_km", - input_freq: str = "mon", - ): - """ - Args: - force_sum_sectors (str): Default is empty, i.e. no merge is forced. - Put the name of GHG here that should be forced to merge. - fire_cases (bool): Default False means that simply all the data is - summed up and that's it. In case this is set to true, three different - things are calculated: anthro-fires, no-fires, all-fires. Can - only be done if "em_openburning" or "em_biomassburning" files - are included. - overwrite (bool): if the files should be overwritten if they already exist - silent (bool): If this should be processed silently. 
- """ - print(f"Start summing up different types of emission files in {sub_dir}...") - if not force_sum_sectors: - force_sum_sectors = [] - # params of the files to sum over - # TODO make these params in a config - # TODO find out which files are skipped - # res = "50_km" - # freq = "mon" - # years = [] - # dicts needed to match files together - # "example_path_SECTOR_sth": [None, None, None] - ssp_file_dict = {} - historical_file_dict = {} - sector_idx = {"em_AIR_anthro": 0, "em_anthro": 1, "em_openburning": 2, "em_biomassburning": 2} - - # match all files that have the exact same name and end on - total_files = len(list(sub_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(sub_dir), total=total_files): - if len(files) > 0: - for file in files: - full_path = str(Path(path) / file) - # only process the files with the right res and frequency - if (input_res in full_path) and (input_freq in full_path): - # sector descriptions - # check if this is historical or ssp and match - if "ssp" in file: - sector = re.search(r"(em_AIR_anthro|em_anthro|em_openburning)", full_path).group(0) - sector_file_str = full_path.replace(sector, "SECTOR") - # add ghg group as key if they don't exist yet - if sector_file_str not in ssp_file_dict: - ssp_file_dict[sector_file_str] = [None, None, None] - # add the found sector at the right sector index - # ssp_file_dict[sector_file_str][sector_idx[sector]] = full_path - ssp_file_dict[sector_file_str][sector_idx[sector]] = sector - - elif "historical" in file: - sector = re.search(r"(em_AIR_anthro|em_anthro|em_biomassburning)", full_path).group(0) - sector_file_str = full_path.replace(sector, "SECTOR") - # add ghg group as key if they don't exist yet - if sector_file_str not in historical_file_dict: - historical_file_dict[sector_file_str] = [None, None, None] - # add the sector at the right sector index - # history_file_dict[sector_file_str][sector_idx[sector]] = full_path - historical_file_dict[sector_file_str][sector_idx[sector]] = sector - - else: - raise ValueError( - "Input4mips files must include the scenario ('ssp' or 'historical') in their name." - ) - - self._sum_sector_cases( - file_dict=ssp_file_dict, - output_dir=output_dir, - fire_cases=fire_cases, - force_sum_sectors=force_sum_sectors, - overwrite=False, - silent=silent, - ) - - self._sum_sector_cases( - file_dict=historical_file_dict, - output_dir=output_dir, - fire_cases=fire_cases, - force_sum_sectors=force_sum_sectors, - overwrite=False, - silent=silent, - ) - - print(f"...Finished summing up ghg emissions and stored results in {output_dir}") - - def _sum_sector_cases( - self, - file_dict: dict, - output_dir: Path, - fire_cases: bool, - force_sum_sectors: list = None, - overwrite: bool = False, - silent: bool = False, - ): - """ - Calls sum sector for different cases. - - Args: - file_dict (dict): "example_path_SECTOR_sth": [file_path, file_path, file_path] - output_dir (Path): output directory, where to save the diff cases. - fire_cases (bool): if the diff fire cases should be considered - force_sum_sectors (str): Default is empty, i.e. no merge is forced. - Put the name of GHG here that should be forced to merge. - overwrite (bool): if the files should be overwritten if they already exist - silent (bool): If this should be processed silently. 
- """ - if not force_sum_sectors: - force_sum_sectors = [] - - for file, sectors in tqdm(file_dict.items()): - num_nones = sum([sec is None for sec in sectors]) - - # create file lists and names for the case where everything is added - ghg = file.split("input4mips/")[-1].split("/")[1].split("_")[0] - input_files = [Path(file.replace("SECTOR", sec)) for sec in sectors if sec is not None] - sector_list = [f"{ghg}_{sec}" for sec in sectors if sec is not None] - out_path = Path(output_dir / "input4mips" / file.split("input4mips/")[-1].replace("SECTOR", "sum")) - out_path.parent.mkdir(parents=True, exist_ok=True) - - if num_nones == 0: - # normal case - if fire_cases: - self.sum_sectors_fires(input_files, sector_list, out_path, overwrite=overwrite) - else: - self._sum_sectors(input_files, sector_list, out_path, overwrite=overwrite) - - elif (num_nones == 1) and (ghg in force_sum_sectors): - # just the simple summing if desired + warning - if not silent: - warnings.warn(f"Merged files for {file}, however some sectors are missing", stacklevel=2) - self._sum_sectors(input_files, sector_list, out_path, overwrite=overwrite) - - elif (num_nones == 1) and ghg not in force_sum_sectors: - if not silent: - warnings.warn( - "Skipping case {}. Not all three sectors are available. You can force merging with 'force_sum_sectors=[GHG1, GHG2]'.".format( - file - ), - stacklevel=2, - ) - - elif num_nones == 2: - # warning, skipping - if not silent: - warnings.warn( - f"Skipping case {file}. At least two files must exist to be summed together.", - stacklevel=2, - ) - else: - raise RuntimeError("'num_nones' must be between 0 and 3.") - - def set_meta_data(self, meta_data: Path): - """Set meta data path if not done during initialization.""" - self.meta_data = meta_data - - def create_anthro_fire_subdir(self, input_dir: Path, overwrite: bool = False): - """ - Creating anthropogenic fire files for a subdir. - - Args: - input_dir (Path): Input dir - overwrite (bool): If data should be overwritten. Default False. - """ - # loop through input dir, get biomassburning and openburning files - print(f"Starting to create anthropogenic fire files for files in {input_dir}.") - fire_output_dir = self.meta_data / "anthro-fire-data" - total_files = len(list(input_dir.rglob("*.nc"))) - for path, subdirs, files in tqdm(os.walk(input_dir), total=total_files): - if len(files) > 0: - for file in files: - input_file = Path(path) / file - # must match the right nominal and temporal resolution - if ("biomassburning" in file) and ("25_km" in file) and ("mon" in file): - type = "biomassburning" - elif ("openburning" in file) and ("50_km" in file) and ("mon" in file): - type = "openburning" - else: - continue - scenario = file.split("_")[1] - ghg = file.split("_")[2] - fire_name = file.replace("_em_", "_anthro_") - output_file = Path(fire_output_dir / type / scenario / ghg / fire_name) - output_file.parent.mkdir(parents=True, exist_ok=True) - if (not output_file.is_file()) or overwrite: - self._create_anthro_fire_file(input_file, type, output_file) - - print(f"... Finished creating anthro fire files and stored them in {fire_output_dir}") - - def _create_anthro_fire_file(self, input_file: Path, type: str, output_file: Path, threads: int = 1): - """ - Create a new openburning or biomassburning file from a given file. - - Returns the path where the new file lifes. 
- Args: - input_file (Path): - type (str): either 'biomassburning' or 'openburning' - output_file (Path): Where to store the anthro fire file - threads (int): threads for cdo - """ - if self.meta_data is None: - print("You need to provide meta data for this function. Use set_meta_data func for that.") - file_parts = str(input_file.stem).split("_") - scenario, ghg, year = file_parts[1], file_parts[2], file_parts[-1] - meta_dir = self.meta_data / type - - # prepare additional operators (calendar and time-axis) for later compatability - chained_operators = [] - # correct calendar and time axis right away if necessary - if self.correct_time_axis: - chained_operators.append("-setday,1") - chained_operators.append("-settime,00:00:00") - if self.correct_calendar: - # default calendar: 365_day - chained_operators.append(f"-setcalendar,{self.calendar}") - - if type == "openburning": - # find a file that contains openburning, scenario and ghg in the meta_data / "future-openburning" - meta_dir = self.meta_data / "future-openburning" - matched_files = [ - f for f in meta_dir.rglob("*.nc") if (scenario in str(f)) and (type in str(f)) and (ghg in str(f)) - ] - # take first file that matches that - raw_percentage_file = matched_files[0] - tmp_perc_file = input_file.parents[0] / f"tmp_percentage_{year}.nc" - - # 1. drop the years you don't need from the meta file. -selyear,2015 - # 2. drop the levels you dont need - sellevel,0,1 - # 3. sum the remaining levels / sectors -vertsum - # -> cdo -vertsum -sellevel,0,1 -selyear,year meta_input.nc 2015_share_output.nc - commands = [ - "cdo", - "-w", - "-s", - "-L", - "-P", - str(threads), - "-vertsum", - "-sellevel,0,1", # 0 is agriculture, 1 is deforestation - f"-selyear,{year}", - raw_percentage_file, - tmp_perc_file, - ] - subprocess.call(commands) - - # multiply -> cdo mul fire_file.nc percentage_file new_anthro_fire_file.nc - # [IMPORTANT first file must be input-file -> gets meta data from first input file!] - commands = [ - "cdo", - "-w", - "-s", - "-L", - "-P", - str(threads), - *chained_operators, - "-mul", - input_file, - tmp_perc_file, - output_file, - ] - subprocess.call(commands) - - # remove 2015_share_output file to save space (unlink) - tmp_perc_file.unlink() - - elif type == "biomassburning": - meta_dir = self.meta_data / "historic-biomassburning" - matched_files = [ - f for f in meta_dir.rglob("*.nc") if (type in str(f)) and (ghg in str(f)) and (year in str(f)) - ] - - # find the agri / defo files (written this way for later extension) - perc_dict = { - "AGRI": None, - "DEFO": None, - } - for f in matched_files: - for key in perc_dict.keys(): - if key in str(f): - perc_dict[key] = f - if (perc_dict["AGRI"] is None) or (perc_dict["DEFO"] is None): - warnings.warn("The AGRI or DEFO file does not exist, see dictionary: \n", stacklevel=2) - print(perc_dict) - print("... 
skipping this file.") - return - - # cdo 'expr,percentage_AGRI=percentage_AGRI/100' add percentage_AGRI_year.nc percentage_DEFO_year.nc percentage_ANTHRO_year.nc - tmp_anthro_perc_file = input_file.parents[0] / f"tmp_percentage_anthro_{year}.nc" - commands = [ - "cdo", - "-w", - "-s", - "-L", - "-P", - str(threads), - "expr,percentage_AGRI=percentage_AGRI/100;", - "-add", - perc_dict["AGRI"], - perc_dict["DEFO"], - tmp_anthro_perc_file, - ] - subprocess.call(commands) - - # cdo mul input_file.nc percentage_ANTHRO_year.nc output.nc - commands = [ - "cdo", - "-w", - "-s", - "-L", - "-P", - str(threads), - *chained_operators, - "-mul", - input_file, - tmp_anthro_perc_file, - output_file, - ] - subprocess.call(commands) - - # delete temp files - tmp_anthro_perc_file.unlink() - - # BIOMASSBURNING - # TODO get the right biomassburning files: year, GHG, biomassburning and AGRI + DEFO - # 1. cdo 'expr,percentage_AGRI=percentage_AGRI/100' -selyear,year meta_input.nc percentage_AGRI_year.nc - # 2. cdo 'expr,percentage_DEFO=percentage_DEFO/100' -selyear,year meta_input.nc percentage_DEFO_year.nc - # 3. cdo add percentage_AGRI_year.nc percentage_DEFO_year.nc percentage_ANTHRO_year.nc - # 4. cdo mul input_file.nc percentage_ANTHRO_year.nc output.nc - # 5 remove old files: all the percentage files - else: - raise ValueError("Type must be either 'openburing' or 'biomassburning'.") - - def sum_sectors_fires(self, input_files: list, sectors: list, output_path: Path, overwrite: bool = True): - """ - Args: - input_files (list): List of paths. The paths contain different - sectors and the information is summed up and stored in the - desired output_file. - sectors (list): List of strings. The different files that are stored - in input_files. The variable names of input_files. - output_path (Path): File where the resulting emissions are stored. - The output_path is adapted in case different fire cases are created. - overwrite (bool): Default is True, i.e. data will be overwritten if it already exists - - """ - # check where (and if) biomassburning or openburning are contained - fire_idx = None - for i, var in enumerate(sectors): - if ("openburning" in var) or ("biomassburning" in var): - fire_idx = i - if fire_idx is None: - raise ValueError( - "Fire cases can only be created if openburning or biomassburning files are given. (Must be contained in the file name as well)." - ) - - # Get ghg and fire type - ghg = sectors[0].split("_")[0] - # fire_type = sectors[fire_idx].split("_")[-1] - - # CASE 1: ALL FIRES - all_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_all-fires_")) - all_fire_files = input_files - all_fire_sectors = sectors - self._sum_sectors(all_fire_files, all_fire_sectors, all_fire_output_path, overwrite=overwrite) - - # CASE 2: ANTHRO FIRES - anthro_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_anthro-fires_")) - anthro_fire_sectors = sectors - # get the right anthro fire file from meta - # 1. get the current fire file - all_fire_file = input_files[fire_idx] - anthro_file_name = str(all_fire_file.name).replace("_em_", "_anthro_") - potential_anthro_files = [f for f in self.meta_data.rglob(anthro_file_name)] - if len(potential_anthro_files) < 1: - print(f"No anthro fire file found for {all_fire_file.name}.") - pass - else: - if len(potential_anthro_files) > 1: - print(f"Several fire files found for {all_fire_file.name}. 
Choosing only the first one.") - anthro_fire_files = input_files.copy() - anthro_fire_files[fire_idx] = potential_anthro_files[0] - self._sum_sectors(anthro_fire_files, anthro_fire_sectors, anthro_fire_output_path, overwrite=overwrite) - - # CASE 3: NO FIRES - # drop the openburning or biomassburning sectors and files - no_fire_output_path = Path(str(output_path).replace(f"_{ghg}_", f"_{ghg}_no-fires_")) - no_fire_files = input_files.copy() - no_fire_sectors = sectors.copy() - no_fire_files.pop(fire_idx) - no_fire_sectors.pop(fire_idx) - self._sum_sectors(no_fire_files, no_fire_sectors, no_fire_output_path, overwrite=overwrite) - - def _sum_sectors( - self, input_files: list, sectors: list, output_path: Path, overwrite: bool = True, threads: str = 1 - ): - """ - Args: - input_files (list): List of paths. The paths contain different - sectors and the information is summed up and stored in the - desired output_file. - sectors (list): List of strings. The different files that are stored - in input_files. The variable names of input_files. - output_path (Path): File where the resulting emissions are stored. - The output_path is adapted in case different fire cases are created. - overwrite (bool): True, i.e. if the file in output_path already - exists, it is overwritten. Set to False if you don't wanna overwrite - these files. - threads (str): Default 1. - - """ - # exit this if the file already exists and we dont wanna overwrite them - if output_path.is_file() and (not overwrite): - return - - ghg = sectors[0].split("_")[0] - # math sum expression - sum_expr = f"{ghg}={sectors[0]}" - for sec in sectors[1:]: - sum_expr = f"{sum_expr}+{sec}" - - # cdo expr,'var_new=var1+var2;' -merge infile1 infile2 outfile # works only on cmd - # cdo 'expr,var_new=var1+var2;' -merge infile1 infile2 outfile # works for both - commands = [ - "cdo", - "-w", - "-s", - "-L", - "-P", - str(threads), - f"expr,{sum_expr};", - "-merge", - *input_files, - output_path, - ] - subprocess.call(commands) diff --git a/climateset/processing/raw/utils.py b/climateset/processing/raw/utils.py index cfaeadd..0ad1047 100644 --- a/climateset/processing/raw/utils.py +++ b/climateset/processing/raw/utils.py @@ -1,5 +1,9 @@ from pathlib import Path +from climateset.utils import create_logger + +LOGGER = create_logger(__name__) + def create_generic_output_path(output_dir: Path, path: str, file: str) -> Path: """Creates an output path and the necessary parent directories. 
diff --git a/tests/test_processing/test_raw_processing.py b/tests/test_processing/test_raw_processing.py
index 276eb57..3f58f85 100644
--- a/tests/test_processing/test_raw_processing.py
+++ b/tests/test_processing/test_raw_processing.py
@@ -5,7 +5,7 @@
 from climateset import DATA_DIR
 from climateset.processing.raw.checker import BasicDirectoryChecker
-from climateset.processing.raw.input4mips_processing import (
+from climateset.processing.raw.input4mips.input4mips_processing import (
     AVAILABLE_INPUT4MIPS_PROCESSING_STEPS,
     Input4MipsEmissionProcesser,
 )
 
@@ -48,7 +48,7 @@ def test_emission_processing_add_processing_steps(simple_emission_processing_obj
 
 def test_emission_processing_add_individual_steps(simple_emission_processing_object):
     simple_emission_processing_object.add_correct_names_step()
-    simple_emission_processing_object.add_fix_co2_step()
+    simple_emission_processing_object.add_reorder_ssp_co2_dimensions_step()
     simple_emission_processing_object.add_create_fire_files()
     simple_emission_processing_object.add_correct_units_step()
     simple_emission_processing_object.add_correct_calendar_step()
diff --git a/tests/test_processing/test_utils.py b/tests/test_processing/test_utils.py
new file mode 100644
index 0000000..f4ef867
--- /dev/null
+++ b/tests/test_processing/test_utils.py
@@ -0,0 +1,11 @@
+from climateset import TEST_DIR
+from climateset.processing.raw.input4mips.processing import rename_biomassburning_file
+
+
+def test_rename_biomassburning_files():
+    test_file = (
+        TEST_DIR / "resources/CO2-em-biomassburning_input4MIPs_emissions_CMIP_VUA-CMIP-BB4CMIP6-1-1_gn_185001-201512.nc"
+    )
+    result = rename_biomassburning_file(test_file)
+    print(result)
+    assert False
From d496ff06ec1f66208c3290df5a6054ba270a81da Mon Sep 17 00:00:00 2001
From: f-PLT
Date: Tue, 26 Nov 2024 11:13:50 -0500
Subject: [PATCH 05/10] Add TODO comment

---
 climateset/processing/raw/cmip6_processor.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/climateset/processing/raw/cmip6_processor.py b/climateset/processing/raw/cmip6_processor.py
index 0b43beb..bcac9ad 100644
--- a/climateset/processing/raw/cmip6_processor.py
+++ b/climateset/processing/raw/cmip6_processor.py
@@ -1,3 +1,4 @@
+# TODO Adapt module for new pipeline pattern
 import os
 import re
 import subprocess
From e02b25b27afd793d5111331aa6e76f65879b96c5 Mon Sep 17 00:00:00 2001
From: f-PLT
Date: Tue, 26 Nov 2024 11:14:19 -0500
Subject: [PATCH 06/10] Add generic step processor

---
 climateset/processing/abstract_processor_step.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/climateset/processing/abstract_processor_step.py b/climateset/processing/abstract_processor_step.py
index 33c2d3c..42b0ff4 100644
--- a/climateset/processing/abstract_processor_step.py
+++ b/climateset/processing/abstract_processor_step.py
@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from pathlib import Path
 
 
 class AbstractProcessorStep(ABC):
@@ -11,3 +12,12 @@ def execute(self, input_directory):
 
     def get_results_directory(self):
         return self.results_directory
+
+
+def process_steps(input_directory: Path, list_of_steps: list[AbstractProcessorStep]):
+    current_input_directory = input_directory
+    for step in list_of_steps:
+        step.execute(current_input_directory)
+        current_input_directory = step.get_results_directory()  # chain each step's results into the next step
+
+    return current_input_directory
From 5b073b9c62af729c44af90387cc5860948880386 Mon Sep 17 00:00:00 2001
From: f-PLT
Date: Tue, 26 Nov 2024 11:14:50 -0500
Subject: [PATCH 07/10] Clean up abstract_raw_processor.py for new pipeline processing

---
.../processing/raw/abstract_raw_processor.py | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/climateset/processing/raw/abstract_raw_processor.py b/climateset/processing/raw/abstract_raw_processor.py index 3f57f6e..4ab0644 100644 --- a/climateset/processing/raw/abstract_raw_processor.py +++ b/climateset/processing/raw/abstract_raw_processor.py @@ -53,28 +53,9 @@ def type_class_meta(self) -> str: """Returns the name tag of the subclass.""" @abstractmethod - def process_directory( - self, - input_dir: Path, - cleaned_dir: Path, - processed_dir: Path, - load_dir: Path, - overwrite: bool, - silent: bool, - sum_sec_input_res: str, - sum_sec_input_freq: str, - ): + def process_directory(self): """ Preprocessing a subdir - must be implemented for each subclass. - Args: - input_dir: - cleaned_dir: - processed_dir: - load_dir: - overwrite: - silent: - sum_sec_input_res: - sum_sec_input_freq: Returns: From be8f29e2dad2bdf44be3cd834ec762c885e68560 Mon Sep 17 00:00:00 2001 From: f-PLT Date: Tue, 26 Nov 2024 11:15:12 -0500 Subject: [PATCH 08/10] Fix missing arguments --- .../raw/input4mips/input4mips_processor.py | 84 ++++++++++--------- 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/climateset/processing/raw/input4mips/input4mips_processor.py b/climateset/processing/raw/input4mips/input4mips_processor.py index c15368d..bab2dbf 100644 --- a/climateset/processing/raw/input4mips/input4mips_processor.py +++ b/climateset/processing/raw/input4mips/input4mips_processor.py @@ -76,6 +76,7 @@ def __init__( spat_load: str = "map", temp_load: str = "mon", ghg_list: list = None, + ghg_sum_name: str = None, meta_data: Path = None, fire_cases: bool = True, force_sum_sectors: list = None, @@ -156,6 +157,7 @@ def __init__( self.ghg_list = ghg_list if not self.ghg_list: self.ghg_list = ["BC_sum", "CH4_sum", "SO2_sum"] + self.ghg_sum_name = ghg_sum_name self.meta_data = meta_data self.fire_cases = fire_cases self.force_sum_sectors = force_sum_sectors @@ -165,9 +167,14 @@ def __init__( self.input_res = input_res self.available_steps = { - REORDER_SSP_CO2: {ORDER: 1, PROCESSOR_STEP: RenameBiomassBurningFilesStep()}, - CORRECT_NAMES: {ORDER: 2, PROCESSOR_STEP: ReorderSSPCO2DimensionsStep()}, - CREATE_FIRE_FILES: {ORDER: 3, PROCESSOR_STEP: CreateAnthroFireFilesStep()}, + CORRECT_NAMES: {ORDER: 1, PROCESSOR_STEP: RenameBiomassBurningFilesStep()}, + REORDER_SSP_CO2: {ORDER: 2, PROCESSOR_STEP: ReorderSSPCO2DimensionsStep()}, + CREATE_FIRE_FILES: { + ORDER: 3, + PROCESSOR_STEP: CreateAnthroFireFilesStep( + metadata_directory=self.meta_data, cdo_operators=self.cdo_operators + ), + }, CORRECT_CALENDAR: {ORDER: 4, PROCESSOR_STEP: self._cdo_add_correct_calendar}, CORRECT_TIME_AXIS: {ORDER: 5, PROCESSOR_STEP: self._cdo_add_correct_time_axis}, SUM_LEVELS: {ORDER: 6, PROCESSOR_STEP: self._cdo_add_sum_levels}, @@ -187,7 +194,12 @@ def __init__( input_freq=self.input_freq, ), }, - MERGE_GHG: {ORDER: 11, PROCESSOR_STEP: MergeGHG()}, + MERGE_GHG: { + ORDER: 11, + PROCESSOR_STEP: MergeGHG( + working_dir=self.working_directory, ghg_list=self.ghg_list, ghg_sum_name=self.ghg_sum_name + ), + }, } # TODO CO2 baseline - where?? 
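+        # Each entry above maps a step name to its fixed ORDER and to the ProcessorStep instance
+        # (or CDO helper method) that implements it. process_directory() only executes the steps
+        # registered in self.processing_steps, sorted by their ORDER value, so callers can add
+        # steps in any order.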
@@ -256,48 +268,34 @@ def file_belongs_to_type(self, input_file: Path) -> bool:
         """
         return bool(re.search("input4mip", str(input_file), re.IGNORECASE))
 
-    def process_directory(
-        self,
-        input_dir: Path,
-        cleaned_dir: Path,
-        processed_dir: Path,
-        load_dir: Path,
-        overwrite: bool = False,
-        silent: bool = False,
-        sum_sec_input_res: str = "50_km",
-        sum_sec_input_freq: str = "mon",
-    ):
+    def process_directory(self):
         """
-        Applying all the stuff as decided in the init function. Most of the functions build upon each other, i.e. you
-        cannot apply store_totals if you haven't used merge_ghg before. To keep things modular the user can still decide
-        which ones to apply (e.g. because some things may have been applied earlier / are not necessary). Just be aware
-        that things break if you are not making sure that the order is followed.
+        Apply the processing steps configured at initialization to the input directory.
 
-        Args:
-            input_dir (Path): To which directory the processing should be applied.
-            cleaned_dir (Path): Where cleaned data should be stored. This is used
-                for the "preprocess" steps.
-            processed_dir (Path): Where processed data is stored. This is used
-                for sum_sectors. Simple preprocessing is directly applied on
-                raw data here.
-            load_dir (Path): Where data is stored that is strongly modified and
-                ready to be loaded. This is used for merge_ghg and store_totals.
-            overwrite (bool): If the data should be overwritten if it already
-                exists. Default False.
-            silent (bool): If this should be processed silently.
-            sum_sec_input_res:
-            sum_sec_input_freq:
+        Most of the steps build on each other, e.g. store_totals cannot be applied unless merge_ghg has run before
+        it. To keep things modular, the user can still choose which steps to apply (for instance because some were
+        already applied earlier or are not necessary), but be aware that processing will break if the required
+        step order is not respected.
""" - if any([process in self.processing_steps for process in [CORRECT_CALENDAR, CORRECT_TIME_AXIS, SUM_LEVELS]]): + special_processes = [ + process for process in self.processing_steps if process in [CORRECT_CALENDAR, CORRECT_TIME_AXIS, SUM_LEVELS] + ] + if special_processes: + for process in special_processes: + self.available_steps[process][PROCESSOR_STEP]() + self.processing_steps.remove(process) self.processing_steps.append(_CDO_PROCESSING) - process_list = [self.available_steps[step] for step in self.available_steps] + process_list = [self.available_steps[step] for step in self.processing_steps] ordered_processes = sorted(process_list, key=lambda step: step[ORDER]) - current_processing_directory = input_dir - for _, process in ordered_processes: - output_dir = process[PROCESSOR_STEP](current_processing_directory) - current_processing_directory = output_dir + current_processing_directory = self.input_directory + for process_dict in ordered_processes: + processing_step: AbstractProcessorStep = process_dict[PROCESSOR_STEP] + processing_step.execute(current_processing_directory) + current_processing_directory = processing_step.get_results_directory() + + return current_processing_directory def _cdo_add_correct_calendar(self): self.cdo_operators.append(f"{CDO_SET_CALENDAR},{self.calendar}") @@ -332,14 +330,18 @@ def execute(self, input_directory): class CreateAnthroFireFilesStep(AbstractProcessorStep): - def __init__(self, metadata_directory, calendar, overwrite=False): + def __init__(self, metadata_directory, cdo_operators, overwrite=False): super().__init__() self.metadata_directory = metadata_directory + self.cdo_operators = cdo_operators self.overwrite = overwrite def execute(self, input_directory): self.results_directory = create_anthro_fire_directory( - input_directory, meta_data=self.metadata_directory, overwrite=self.overwrite + input_directory, + meta_data=self.metadata_directory, + cdo_operators=self.cdo_operators, + overwrite=self.overwrite, ) From 6d8e074d5dfa323871d1fe05f2067d72e75115ac Mon Sep 17 00:00:00 2001 From: f-PLT Date: Tue, 26 Nov 2024 11:15:37 -0500 Subject: [PATCH 09/10] Add small pipeline example script --- scripts/example_input4mips_processing.py | 50 ++++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 scripts/example_input4mips_processing.py diff --git a/scripts/example_input4mips_processing.py b/scripts/example_input4mips_processing.py new file mode 100644 index 0000000..f7e2e1b --- /dev/null +++ b/scripts/example_input4mips_processing.py @@ -0,0 +1,50 @@ +from pathlib import Path + +from climateset.processing.abstract_processor_step import process_steps +from climateset.processing.raw.input4mips.input4mips_processor import ( + CorrectUnitsStep, + CreateAnthroFireFilesStep, + Input4MipsEmissionProcessor, + RenameBiomassBurningFilesStep, + ReorderSSPCO2DimensionsStep, +) + +# +# Simple example with Input4MipsEmissionProcessor +# + +# Stand in for arguments +processor_arguments = {"input_directory": ""} + +# Create processor for a specific directory +processor = Input4MipsEmissionProcessor(**processor_arguments) + +# Add processing steps - These will be ordered by Input4MipsEmissionProcessor +processor.add_correct_calendar_step() +processor.add_reorder_ssp_co2_dimensions_step() +processor.add_correct_names_step() +processor.add_correct_units_step() +processor.add_create_fire_files() + +# Execute steps +processor.process_directory() + +# +# More manual example with manual setting of steps +# + +# Stand ins for arguments +input_directory = 
Path("SOME_PATH_HERE/") +anthrofire_arguments = {"metadata_directory": "", "cdo_operators": []} +correct_units_arguments = {"desired_units": ""} + +# Create steps +step_1 = RenameBiomassBurningFilesStep() +step_2 = ReorderSSPCO2DimensionsStep() +step_3 = CorrectUnitsStep(**correct_units_arguments) +step_4 = CreateAnthroFireFilesStep(**anthrofire_arguments) + +process_list = [step_1, step_2, step_3, step_4] + +# Execute steps +output_path = process_steps(input_directory=input_directory, list_of_steps=process_list) From 1889740dccd62ece1c31e826894ff25cd2fbfce0 Mon Sep 17 00:00:00 2001 From: f-PLT Date: Fri, 10 Jan 2025 11:44:20 -0500 Subject: [PATCH 10/10] Rename abstract_processor_step.py to processor_step.py --- .../{abstract_processor_step.py => processor_step.py} | 0 climateset/processing/raw/input4mips/input4mips_processor.py | 4 +++- scripts/example_input4mips_processing.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) rename climateset/processing/{abstract_processor_step.py => processor_step.py} (100%) diff --git a/climateset/processing/abstract_processor_step.py b/climateset/processing/processor_step.py similarity index 100% rename from climateset/processing/abstract_processor_step.py rename to climateset/processing/processor_step.py diff --git a/climateset/processing/raw/input4mips/input4mips_processor.py b/climateset/processing/raw/input4mips/input4mips_processor.py index bab2dbf..d177559 100644 --- a/climateset/processing/raw/input4mips/input4mips_processor.py +++ b/climateset/processing/raw/input4mips/input4mips_processor.py @@ -1,9 +1,11 @@ +"""This Module contains a Processor class that automates ProcessorStep processes, as well as individual ProcessorStep +classes that can be used independently for more customized applications.""" import re from pathlib import Path from typing import Union from climateset import PROCESSED_DATA -from climateset.processing.abstract_processor_step import AbstractProcessorStep +from climateset.processing.processor_step import AbstractProcessorStep from climateset.processing.raw.abstract_raw_processor import AbstractRawProcessor from climateset.processing.raw.checker import AbstractDirectoryChecker from climateset.processing.raw.input4mips.processing import ( diff --git a/scripts/example_input4mips_processing.py b/scripts/example_input4mips_processing.py index f7e2e1b..7562450 100644 --- a/scripts/example_input4mips_processing.py +++ b/scripts/example_input4mips_processing.py @@ -1,6 +1,6 @@ from pathlib import Path -from climateset.processing.abstract_processor_step import process_steps +from climateset.processing.processor_step import process_steps from climateset.processing.raw.input4mips.input4mips_processor import ( CorrectUnitsStep, CreateAnthroFireFilesStep,