# Source code for mspasspy.util.Janitor

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Nov 13 14:56:53 2024

@author: pavlis
"""
import os
import yaml
from mspasspy.ccore.utility import MsPASSError
from mspasspy.ccore.seismic import (
    TimeSeries,
    Seismogram,
    TimeSeriesEnsemble,
    SeismogramEnsemble,
    _CoreTimeSeries,
)


class Janitor:
    """
    Generic handler to clean up the Metadata namespace of a MsPASS data
    object.

    In data processing it is common for Metadata attributes to become
    inconsistent with the data.  An example is a "chan" attribute that
    makes no sense if the data have passed through an algorithm to
    convert a set of TimeSeries objects into Seismogram objects.  The
    name of the class is meant as a memory device that the object is
    used to clear attributes that are junk/trash that need to be
    removed.

    There are two fundamentally different conceptual ways for the
    Janitor to handle the trash.  First is to discard it forever.  That
    is the approach of the method called `clean`.  When `clean` is
    called on a datum the inconsistent attributes (trash) are thrown
    away (like garbage sent to a landfill).  The alternative is to bag
    up the garbage and put it somewhere until you are ready to deal with
    it.  That is the idea of the two methods called `collect_trash` and
    `bag_trash`.  `collect_trash` removes trash attributes from the
    object but returns the trash in a container (implemented as a python
    dictionary).  `bag_trash` takes the result of `collect_trash` and
    posts it back to the object as a subdocument (a python dictionary)
    with a specified key.  That mode can be useful for an experimental
    algorithm where you may need to pull some trash from the bag later
    but need to get the debris out of the way for understanding.

    Handling of ensembles is potentially ambiguous as both the ensemble
    container itself and all the members have a Metadata container.  For
    that reason the class has a separate set of keys that define
    attributes to be retained for ensembles.  The default for ensembles
    is an empty list because in most cases the ensemble metadata is
    loaded from a normalizing collection and does not need to be
    retained (e.g. source attributes for a common source gather).

    This class can be thought of as an inline version of the `clean` and
    `clean_collection` methods of
    :py:class:`mspasspy.db.database.Database`.  That is, the database
    versions can be used to do a similar operation on data previously
    stored in MongoDB.  The methods of this class would normally be used
    as the function in a map operator or in an assignment in a serial
    loop.

    The defaults for the class are designed to be appropriate for stock
    use.  The variable args for the constructor can be used to override
    the list of attribute keys that the Janitor should treat as not
    junk.  The defaults are loaded from a yaml file.  You can also
    change the namespace by specifying an alternate name for the yaml
    file.  The file is expected to contain keys to retain with the
    dictionary keys "TimeSeries" and "Seismogram".  See the default in
    mspass/data/yaml/Janitor.yaml for the format.

    :param keepers_file: yaml file containing the keys to be retained
        for TimeSeries and Seismogram objects.
    :type keepers_file: string defining a yaml file name.  Default is
        None which assumes a default path of
        $MSPASS_HOME/data/yaml/Janitor.yaml.
    :param TimeSeries_keepers: Use to override the list defined in the
        yaml file for TimeSeries objects.  If defined, it should be a
        list of attribute keys to be retained.  Use this option with
        caution as the list is not checked for required Metadata.  Use
        the default yaml file for guidance.
    :type TimeSeries_keepers: assumed to be a list of strings of
        attributes to be retained for TimeSeries objects.  Default is
        None which causes this argument to be ignored and the yaml file
        to define the list of attributes to be retained.
    :param Seismogram_keepers: Use to override the list defined in the
        yaml file for Seismogram objects.  If defined, it should be a
        list of attribute keys to be retained.  Use this option with
        caution as the list is not checked for required Metadata.  Use
        the default yaml file for guidance.
    :type Seismogram_keepers: assumed to be a list of strings of
        attributes to be retained.  Default is None which causes this
        argument to be ignored and the yaml file to define the list of
        attributes to be retained.
    :param ensemble_keepers: Use to override the content of the yaml
        file.  If defined it replaces the yaml file definition of what
        attribute keys should be retained for ensemble Metadata.
    :type ensemble_keepers: list of strings defining keys of attributes
        that should not be treated as junk and retained.
    :param process_ensemble_members: boolean controlling behavior with
        ensemble objects.  With ensembles there are two possible
        definitions of what Metadata container is to be cleaned.  That
        is, the ensemble itself has a Metadata container and each atomic
        member has a Metadata container.  When True the cleaning
        operators are also applied to the members.  When False only the
        ensemble container is handled if the datum is an ensemble.  This
        argument is ignored when processing atomic data.
    """

    def __init__(
        self,
        keepers_file=None,
        TimeSeries_keepers=None,
        Seismogram_keepers=None,
        ensemble_keepers=None,
        process_ensemble_members=True,
    ):
        if keepers_file is None:
            # Default path:  $MSPASS_HOME/data/yaml/Janitor.yaml if
            # MSPASS_HOME is defined; otherwise resolve relative to the
            # package install directory.
            if "MSPASS_HOME" in os.environ:
                keepers_file = (
                    os.path.abspath(os.environ["MSPASS_HOME"])
                    + "/data/yaml/Janitor.yaml"
                )
            else:
                keepers_file = os.path.abspath(
                    os.path.dirname(__file__) + "/../data/yaml/Janitor.yaml"
                )
        elif not os.path.isfile(keepers_file):
            # A bare file name that is not a valid path is resolved
            # against the standard mspass data/yaml directory.
            if "MSPASS_HOME" in os.environ:
                keepers_file = os.path.join(
                    os.path.abspath(os.environ["MSPASS_HOME"]),
                    "data/yaml",
                    keepers_file,
                )
            else:
                keepers_file = os.path.abspath(
                    os.path.join(
                        os.path.dirname(__file__), "../data/yaml", keepers_file
                    )
                )
        try:
            with open(keepers_file, "r") as stream:
                keepers_dict = yaml.safe_load(stream)
        except yaml.YAMLError as e:
            raise MsPASSError(
                "Janitor constructor: Parser failed reading keepers_file="
                + keepers_file,
                "Fatal",
            ) from e
        except EnvironmentError as e:
            raise MsPASSError(
                "Janitor constructor: Cannot open keepers_file = " + keepers_file,
                "Fatal",
            ) from e
        self._parse_yaml_file(keepers_dict)
        self.process_ensemble_members = process_ensemble_members
        # Constructor kwargs, when set, override the yaml-file lists.
        if TimeSeries_keepers:
            self.TimeSeries_keepers = TimeSeries_keepers
        if Seismogram_keepers:
            self.Seismogram_keepers = Seismogram_keepers
        if ensemble_keepers:
            # BUG FIX:  was misspelled "self.ensemle_keepers", so the
            # ensemble_keepers constructor override silently never took
            # effect (all other methods read self.ensemble_keepers).
            self.ensemble_keepers = ensemble_keepers

    def clean(self, datum):
        """
        Process datum to remove all Metadata with keys not defined in
        this instance of Janitor.  Returns the datum with the attributes
        it treats as junk removed.

        For ensembles, if self.process_ensemble_members is True the
        operation will also be applied to all ensemble members; the
        ensemble's own Metadata container is always edited against
        self.ensemble_keepers.  Dead data are returned unaltered.

        :param datum: data object to be processed.  Must be a MsPASS
            data object or a MsPASSError marked Fatal will be raised.
        """
        dtype = self._validate_datum(datum)
        if datum.dead():
            return datum
        if dtype == "ensemble":
            # snapshot of keys to avoid mutating while iterating
            for k in list(datum.keys()):
                if k not in self.ensemble_keepers:
                    datum.erase(k)
            if self.process_ensemble_members:
                for i in range(len(datum.member)):
                    datum.member[i] = self.clean(datum.member[i])
        else:
            # this uses _CoreTimeSeries as a base class to TimeSeries
            # to be more robust - resolves True for a TimeSeries
            if isinstance(datum, _CoreTimeSeries):
                keepers = self.TimeSeries_keepers
            else:
                # currently this can only be Seismogram - if data
                # types supported is extended this needs a change
                keepers = self.Seismogram_keepers
            for k in list(datum.keys()):
                if k not in keepers:
                    datum.erase(k)
        return datum

    def collect_trash(self, datum) -> dict:
        """
        Processes datum by extracting attributes that this instance of
        Janitor does not define as a keeper.  It clears the attributes
        it treats as junk before returning those attributes in a python
        dictionary.

        When run on ensembles the self.ensemble_keepers list is used to
        edit the ensemble Metadata container; the ensemble members are
        not altered.

        :param datum: data object to be processed.  Must be a MsPASS
            data object or a MsPASSError marked Fatal will be raised.
        :return: dictionary of the attributes removed from datum.  A
            dead datum is not altered and yields an empty dictionary.
        """
        dtype = self._validate_datum(datum)
        if datum.dead():
            # BUG FIX:  previously returned datum itself, violating the
            # declared -> dict return contract.  Dead data have no
            # trash collected, so return an empty dictionary.
            return dict()
        if dtype == "ensemble":
            keepers = self.ensemble_keepers
        else:
            # this uses _CoreTimeSeries as a base class to TimeSeries
            # to be more robust - resolves True for a TimeSeries
            if isinstance(datum, _CoreTimeSeries):
                keepers = self.TimeSeries_keepers
            else:
                # currently this can only be Seismogram - if data
                # types supported is extended this needs a change
                keepers = self.Seismogram_keepers
        result = dict()
        # snapshot of keys to avoid mutating while iterating
        for k in list(datum.keys()):
            if k not in keepers:
                result[k] = datum[k]
                datum.erase(k)
        return result

    def bag_trash(
        self, datum, trashbag_key="trash", ensemble_trashbag_key="ensemble_trash"
    ):
        """
        This method bundles up trash in a python dictionary and posts it
        back to the datum with a specified key.  This allows the trash
        data to be retained but put into an auxiliary trash bag
        container to simplify the datum's Metadata namespace.  The idea
        is to allow an algorithm to pull junk from the trash bag if
        necessary at a later stage.

        Note for ensembles there are two different entities that are
        handled separately.  Any attributes in the ensemble's Metadata
        are processed against self.ensemble_keepers and any junk is
        bagged into a dictionary stored back in the ensemble's Metadata
        container with the key defined by ensemble_trashbag_key.
        Members are only processed if self.process_ensemble_members is
        True, in which case each member is bagged using trashbag_key.

        :param datum: MsPASS data object to be processed.
        :type datum: Must be a MsPASS data object (`TimeSeries`,
            `Seismogram`, `TimeSeriesEnsemble`, or `SeismogramEnsemble`)
            or this method will raise a MsPASSError exception marked
            Fatal.
        :param trashbag_key: dictionary key used to post the trash bag
            for atomic data or the members of ensembles when
            self.process_ensemble_members is True.
        :type trashbag_key: str (default "trash")
        :param ensemble_trashbag_key: dictionary key used to post the
            trashbag dictionary constructed from an ensemble's Metadata
            container.  Ignored for atomic data.  Note this key should
            be distinct from trashbag_key or the member trashbags will
            be overwritten by the ensemble trashbag if the ensemble is
            saved.
        :type ensemble_trashbag_key: str (default "ensemble_trash")
        """
        dtype = self._validate_datum(datum)
        if datum.dead():
            return datum
        if dtype == "ensemble":
            trash = self.collect_trash(datum)
            datum[ensemble_trashbag_key] = trash
            if self.process_ensemble_members:
                for i in range(len(datum.member)):
                    trash = self.collect_trash(datum.member[i])
                    datum.member[i][trashbag_key] = trash
        else:
            trash = self.collect_trash(datum)
            datum[trashbag_key] = trash
        return datum

    def add2keepers(self, key, keeper_type="atomic"):
        """
        Adds a new key to the namespace for a data type.

        It is often useful to extend the namespace during processing
        without having to create a special instance of a Janitor from a
        yaml file.  Use this method to add a key to the list of
        attributes defined as a keeper.

        :param key: key of the attribute to add to the list of keeper
            names in this Janitor.  Note if the name already exists in
            the list this method does nothing.
        :type key: str
        :param keeper_type: defines the data type to which key should be
            added.  Normal use is one of the following keywords whose
            use should be clear:  "TimeSeries", "Seismogram", and
            "ensemble".  Also accepts the special keyword "atomic" which
            means the key is added to the list of keepers for both
            TimeSeries and Seismogram objects.
        :type keeper_type: str (default "atomic")
        :raises ValueError: if keeper_type is not one of the four
            accepted keywords.
        """
        if keeper_type == "atomic":
            self.add2keepers(key, "TimeSeries")
            self.add2keepers(key, "Seismogram")
        elif keeper_type == "TimeSeries":
            if key not in self.TimeSeries_keepers:
                self.TimeSeries_keepers.append(key)
        elif keeper_type == "Seismogram":
            if key not in self.Seismogram_keepers:
                self.Seismogram_keepers.append(key)
        elif keeper_type == "ensemble":
            if key not in self.ensemble_keepers:
                self.ensemble_keepers.append(key)
        else:
            message = "Janitor.add2keepers: illegal value for keeper_type={}".format(
                keeper_type
            )
            raise ValueError(message)

    def _parse_yaml_file(
        self,
        keepers_dict,
        TimeSeries_key="TimeSeries",
        Seismogram_key="Seismogram",
        ensemble_key="Ensemble",
    ):
        """
        Internal method to parse the dictionary returned by pyyaml
        parsing of a specified keepers definition.  kwarg values specify
        keys for the TimeSeries and Seismogram lists the parser expects
        to see.  This method will throw an exception if either is
        missing.  The ensemble list is optional and defaults to an empty
        list.

        Normally called only by the constructor, but could be used to
        redefine a Janitor with a parsed result from a different yaml
        file.  Normally that would be silly, however, as the same thing
        would be clearer by just redefining the instance of Janitor.
        This was made a method largely to encapsulate the translation
        step from the yaml dict to the lists the constructor needs to
        define.
        """
        if TimeSeries_key in keepers_dict and Seismogram_key in keepers_dict:
            self.TimeSeries_keepers = keepers_dict[TimeSeries_key]
            self.Seismogram_keepers = keepers_dict[Seismogram_key]
        else:
            message = "Janitor._parse_yaml_file: "
            message += "Missing required keys = {} and {} for TimeSeries and Seismogram object keepers defintions respectively".format(
                TimeSeries_key, Seismogram_key
            )
            raise MsPASSError(message, "Fatal")
        if ensemble_key in keepers_dict:
            self.ensemble_keepers = keepers_dict[ensemble_key]
        else:
            self.ensemble_keepers = []

    def _validate_datum(self, datum) -> str:
        """
        Called internally by processing methods to validate the type of
        a datum input to processing methods through required arg0.

        Returns "atomic" if datum is an atomic datum (TimeSeries or
        Seismogram) or "ensemble" if the datum is a valid ensemble
        object (TimeSeriesEnsemble or SeismogramEnsemble).  Will raise a
        MsPASSError if the datum is not a valid mspass data object.
        """
        if isinstance(datum, (TimeSeries, Seismogram)):
            return "atomic"
        elif isinstance(datum, (TimeSeriesEnsemble, SeismogramEnsemble)):
            return "ensemble"
        else:
            message = "Cannot process input datum of type={}".format(str(type(datum)))
            raise MsPASSError(message, "Fatal")
class MiniseedJanitor(Janitor):
    """
    Convenience class for handling data read from wf_miniseed.

    Data loaded from miniseed files in MsPASS tend to have some debris
    that is not needed once the data are loaded into memory for
    processing.  This is a convenience class that loads a different yaml
    file to create a stock Janitor for handling data loaded from
    wf_miniseed.

    WARNING:  use this class only on data immediately after loading from
    wf_miniseed.  It will eliminate miniseed-specific debris that is a
    perfect example of why a Janitor is useful.  It should not, however,
    be used after any processing that loads additional attributes or it
    will almost certainly delete useful attributes.
    """

    def __init__(self):
        # Delegate to the base class with the miniseed-specific keepers
        # file; the Janitor constructor resolves this bare file name
        # against the standard mspass data/yaml directory.
        super().__init__("MiniseedJanitor.yaml")