Source code for mspasspy.preprocessing.css30.dbarrival

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 24 06:31:28 2020

@author: Gary Pavlis, Dept. of Earth and Atmos Sci, Indiana University
"""
from mspasspy.ccore.utility import MsPASSError, ErrorSeverity
import pandas as pd
from obspy import UTCDateTime


def load_css30_arrivals(
    db,
    filename,
    attribute_names=[
        "evid",
        "source_lat",
        "source_lon",
        "source_depth",
        "source_time",
        "mb",
        "ms",
        "sta",
        "phase",
        "iphase",
        "delta",
        "seaz",
        "esaz",
        "residual",
        "time",
        "deltim",
    ],
):
    """
    Loads an ascii table of arrival time data extracted from an antelope
    (css3.0) database.  The default format of the table is that produced by
    a shell script linked to this function that has the (current) name of
    dbexport_to_mongodb.csh (that name is likely to change before a release).
    The names are locked to css3.0 attribute names that form the argument
    list to a program called dbselect.  Users of other databases can easily
    simulate this in SQL by listing the same attributes in the same order.
    The names used here are translations of the css3.0 concepts to mspass.
    Note that not all of these attributes are always required, and
    alternative implementations may want to change that list - hence
    attribute_names is a defaulted parameter.

    :param db: is a MongoDB database handle.  It can be as basic as the
      return of client('dbname') but it can also be an instance of the
      mspass Database class.  There is no default.
    :param filename: is the string defining a file containing the expected
      text file with columns in the order defined by attribute_names.
    :param attribute_names: is the list of MongoDB attribute keys to assign
      to each column of data.
    :return: MongoDB InsertManyResult object.  This can be used, for
      example, to get the number of rows inserted into arrival from
      len(r.inserted_ids) where r is the symbol given the return.
    """
    # I can find no documentation saying anything about what exceptions
    # this function will throw.  For now assume it will be run interactively
    # and errors will be handled by the user when the error is automatically
    # printed by the interpreter.
    #
    # The pandas reader used here is for clean whitespace-delimited text.
    # That works for these attributes ONLY because none of them ever have
    # spaces.  Note this reader would be dangerous if used with an attribute
    # like a comment attribute that has spaces in the string associated with
    # its value.  It is always safe here because sta, phase, and iphase
    # never have spaces.
    df = pd.read_table(
        filename, delim_whitespace=True, header=None, names=attribute_names
    )
    df.reset_index(inplace=True)
    data_dict = df.to_dict("records")
    col = db.arrival
    ret = col.insert_many(data_dict)
    # ret is a special object returned by mongodb - basically a list of
    # object ids.  We return it for convenience.
    return ret

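
# Hedged usage sketch (not part of the module).  The database name "mydb"
# and file name "arrivals.txt" are hypothetical placeholders for whatever
# your dbselect export produced:
#
#   from pymongo import MongoClient
#   client = MongoClient()
#   db = client["mydb"]
#   r = load_css30_arrivals(db, "arrivals.txt")
#   print("inserted", len(r.inserted_ids), "arrival documents")
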
def load_css30_sources(
    db,
    srcdict,
    collection="source",
    attribute_names=["evid", "lat", "lon", "depth", "time"],
):
    """
    Companion to extract_unique_css30_sources to load the output of that
    function into a MongoDB database.  The algorithm is cautious and first
    scans the existing source collection for any matching evids.  If it
    finds any it prints them and does nothing but issue an error message.
    That was done because this function is only expected to be run
    interactively for preprocessing.

    :param db: MongoDB database handle
    :param srcdict: dict output of extract_unique_css30_sources
    :param collection: optional alternative collection to save to
      (default is source)
    :param attribute_names: list of keys to copy from srcdict to the
      database.  Note that currently no aliases are allowed and we don't
      test that these keys are found.  We assume the list is consistent
      with what is posted by extract_unique_css30_sources.
    """
    dbh = db[collection]
    # first scan for matches in any of the evids
    need_to_fix = dict()
    for evid in srcdict:
        query = {"evid": evid}
        n = dbh.count_documents(query)
        if n > 0:
            rec = dbh.find_one(query)
            need_to_fix[evid] = rec
    if len(need_to_fix) > 0:
        print(
            "The following records in collection ",
            collection,
            " have matching data for one or more evids",
        )
        print("You must fix the mismatch problem before you can load these data")
        for k in need_to_fix:
            print(k, need_to_fix[k])
        return None
    # could have an else here but control comes here unless we hit
    # the return condition above
    count = 0
    for evid in srcdict:
        rec = srcdict[evid]
        srcoid = dbh.insert_one(rec).inserted_id
        # get the object id from the return value and update this record to
        # set source_id to the object id of this record
        dbh.update_one({"_id": srcoid}, {"$set": {"source_id": str(srcoid)}})
        count += 1
    return count

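
# Hedged usage sketch showing the intended pairing with
# extract_unique_css30_sources (defined below); the file name is a
# hypothetical placeholder:
#
#   srcdict = extract_unique_css30_sources("arrivals.txt")
#   n = load_css30_sources(db, srcdict)
#   if n is None:
#       print("fix the evid collisions reported above and rerun")
#   else:
#       print("loaded", n, "source documents")
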
def set_source_id_from_evid(
    db, collection="arrival", use_immortal_cursor=False, update_all=False
):
    """
    Sets source_id in arrivals by matching evid attributes in the source
    collection.

    The concept here is that source_id is the unique key in MsPASS but evid
    is a foreign key defined when importing data from a CSS3.0 database.
    The evid key can thus not be trusted, so if we override it with a
    source_id we can guarantee it is unique.  This function will only work
    if the evid attribute is set in documents in the source collection.
    Currently this can be done with a function called load_css30_sources.

    :param db: MongoDB handle or MsPASS Database object managing these data.
    :param collection: collection to search.  The default is arrival, but
      any collection that has evid defined could be scanned.  The algorithm
      simply tries to match evid and updates documents it finds with the
      object id of the source record it matches.
    :param use_immortal_cursor: If True the cursor in the update is made
      "immortal", meaning it won't time out.  This may be necessary if the
      arrival collection is large.
    :param update_all: if True the function will force cross referencing
      and setting of every document in arrival.  The default (False)
      updates only documents where source_id is not yet set.  The default
      behavior, for example, would be the norm when new arrival data are
      added to a database and need that indexing.
    :return: list of results with mixed content.  0 and 1 are integers
      defining the number of documents processed and the number altered,
      respectively.  2 is a dict keyed by evid with a value equal to the
      count of the number of documents found and altered with that evid.
      Component 3 is the complement to 2; an evid-keyed dict with the
      count of the number of documents encountered for which evid did not
      match any document in source.
    """
    dbarr = db[collection]
    dbsrc = db["source"]
    if update_all:
        query = {}
    else:
        query = {"source_id": None}
    if use_immortal_cursor:
        alldocs = dbarr.find(query, no_cursor_timeout=True)
    else:
        alldocs = dbarr.find(query)
    number_arrivals = 0
    number_set = 0
    evid_set = dict()
    not_set = dict()
    for doc in alldocs:
        if "evid" in doc:
            evid = doc["evid"]
            query = {"evid": evid}
            n = dbsrc.count_documents(query)
            if n == 0:
                if evid in not_set:
                    not_set[evid] += 1
                else:
                    not_set[evid] = 1
            else:
                srcrec = dbsrc.find_one(query)
                source_id = srcrec["source_id"]
                arroid = doc["_id"]
                dbarr.update_one({"_id": arroid}, {"$set": {"source_id": source_id}})
                number_set += 1
                if evid in evid_set:
                    evid_set[evid] += 1
                else:
                    evid_set[evid] = 1
        number_arrivals += 1
    return [number_arrivals, number_set, evid_set, not_set]

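
# Hedged usage sketch.  Run after load_css30_sources so evid is defined in
# the source collection:
#
#   nproc, nset, set_counts, unmatched = set_source_id_from_evid(db)
#   print(nproc, "documents scanned,", nset, "updated")
#   if unmatched:
#       print("evids with no source match:", list(unmatched.keys()))
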
def extract_unique_css30_sources(
    filename,
    attribute_names=[
        "evid",
        "source_lat",
        "source_lon",
        "source_depth",
        "source_time",
        "mb",
        "ms",
        "sta",
        "phase",
        "iphase",
        "delta",
        "seaz",
        "esaz",
        "residual",
        "time",
        "deltim",
    ],
):
    """
    Utility function to scan the same table used by load_css30_arrivals to
    create a dict of unique sources keyed by the parent css3.0 database key
    evid.  This will only work if evid is set correctly for each row in the
    input table.

    The algorithm used is a bit ugly and exploits the unique-key insertion
    of a python dict container, which behaves like a C++ std::map
    container.  That is, if new data is inserted with a key matching
    something already in the container, the new data silently replaces the
    old data.  This is a clean way to create a unique set of data keyed by
    evid, BUT as noted it will create extraneous results if evid values
    are not consistent with the arrivals (that shouldn't happen if the
    parent css3.0 database was properly formed).

    :param filename: text file to scan, created from a Datascope shell
      command ending with dbselect to produce attributes in the order
      listed for attribute_names (the default anyway).
    :param attribute_names: is a list of keys to assign to each column of
      data in the input file.  Default is for the output of a particular
      shell script ending a unix chain with dbselect with attributes in
      the order listed.  If the dbselect line changes this attribute will
      need to be changed too.
    :return: dict keyed by evid of source coordinate data.
    """
    df = pd.read_table(
        filename, delim_whitespace=True, header=None, names=attribute_names
    )
    df.reset_index(inplace=True)
    recs = df.to_dict("records")
    sources = dict()
    for d in recs:
        evid = d["evid"]
        lat = d["source_lat"]
        lon = d["source_lon"]
        depth = d["source_depth"]
        time = d["source_time"]
        # this depends upon the container replacing content when keys match -
        # inefficient but it works
        sources[evid] = {
            "evid": evid,
            "lat": lat,
            "lon": lon,
            "depth": depth,
            "time": time,
        }
    return sources

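
# Hedged usage sketch; "arrivals.txt" is a hypothetical placeholder for the
# dbselect output file:
#
#   srcdict = extract_unique_css30_sources("arrivals.txt")
#   print("found", len(srcdict), "unique evids")
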
def parse_snetsta(fname, verbose=False):
    """
    Parses the raw text in an antelope db.snetsta file.  It returns a dict
    with antelope's sta attribute as the key (that sta is not necessarily
    the seed sta).  Each entry points to a dict with keys net and fsta.
    We use fsta as that is the field BRTT defines for the seed sta field
    in snetsta.

    :param fname: is the snetsta file to be parsed.
    :param verbose: if True the function will print all stations for which
      fsta does not match sta
    """
    with open(fname, "r") as fp:
        staindex = {}
        for line in fp:
            x = line.split()  # default split is on whitespace
            net = x[0]
            fsta = x[1]
            sta = x[2]
            staindex[sta] = {"fsta": fsta, "net": net}
            if verbose and fsta != sta:
                print(
                    "Warning:  station in net=",
                    net,
                    " uses altered sta code=",
                    sta,
                    " for sta=",
                    fsta,
                )
        return staindex

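
# Hedged usage sketch; the file path and the example key are hypothetical
# (antelope names the table dbname.snetsta and keys duplicates like AAK_II):
#
#   staindex = parse_snetsta("mydb.snetsta", verbose=True)
#   print(staindex.get("AAK_II"))   # e.g. {'fsta': 'AAK', 'net': 'II'}
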
def make_css30_composite_sta(sta, net):
    """
    Small helper for use below, but of potential general use.  Creates a
    composite station code using antelope rules for mixing the sta and net
    codes passed as args.  Returns the composite name
    (e.g. AAK_II or XYZTXX).
    """
    n = len(sta)
    if n <= 3:
        s = sta + "_" + net
    else:
        # Note sta can sometimes be more than 4 characters and the
        # result of this would make an invalid station code for datascope.
        # Since we only preserve this as a separate attribute it is
        # better to preserve the pieces this way until proven otherwise.
        s = sta + net
    return s

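
# A minimal illustration of the two naming rules implemented above:
#
#   make_css30_composite_sta("AAK", "II")   # -> "AAK_II" (sta <= 3 chars)
#   make_css30_composite_sta("HELL", "XX")  # -> "HELLXX" (sta >= 4 chars)
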
def set_netcode_snetsta(db, staindex, collection="arrival", use_immortal_cursor=False):
    """
    Takes the dict staindex that defines how snetsta defines seed codes for
    antelope tables and updates a specified collection to add the net code
    and, when necessary, repair the station name antelope uses to deal with
    multiple sites having the same station code (sta) but different network
    codes.

    Antelope uses the css3.0 schema, which was invented before anyone
    conceived the need for handling the quantity of data seen today
    assembled from multiple sources.  As a result the schema has a
    fundamental flaw wherein a channel is defined in css3.0 by two codes we
    refer to in mspass as "sta" and "chan" (these happen to be the same as
    those used by antelope).  When the SEED standard was defined, the
    committee drafting the standard had the wisdom to realize sta and chan
    were not sufficient to describe data assembled from multiple sources
    when station codes were chosen independently by network operators.
    Hence, they adopted the idea of adding a network (net) and location
    (loc) code to tag all data.  All SEED and miniSEED data are therefore
    keyed by these four codes (SEED contains metadata as well as data; the
    miniSEED subset, which is the norm today, contains only data bunched in
    packets, with each packet keyed by net:sta:loc:chan:starttime:endtime).
    All that background is included for users to understand the context of
    this function.

    BRTT, the developers of Antelope, recognized the limitations of css3.0
    early on, but realized the depth of the problem long after their code
    base was deeply locked into css3.0.  Rather than fix the problem
    properly, they chose a kludge solution that has created a weakness in
    antelope ever since.  To handle duplicate stations they created a
    composite net:sta key with names like AAK_II and composite channel
    names (for loc codes) like BHZ_00.  Things might have been better had
    they made all sta keys this composite, but because users are locked
    into sta codes for a variety of reasons they elected to retain sta and
    use the composite ONLY IF a duplicate sta was present.  That method
    works fine for largely static data in fixed network operations (their
    primary customers) but is a huge mess for research data sets bringing
    in data from multiple sources.  Hence, for mspass we need to get rid of
    anything remotely linked to snetsta and schanloc as cleanly as
    possible.  This function is aimed at fixing snetsta problems.

    The function works off an index passed as staindex created by a
    companion function called "parse_snetsta".  This function scans the
    collection it is pointed at (default is arrival, but any collection
    containing "sta" as an attribute can be handled) and looks for matching
    entries for the "fsta" field in snetsta.  When it finds a match it adds
    the net code it finds in the index, corrects the sta code (expanded in
    a moment), and sets a new attribute "css30_sta".  The function handles
    composite names defined for sta by a simple algorithm that duplicates
    the way antelope handles duplicate station codes.  If the station code
    is 3 characters or less, the name is created in the form sta_net (e.g.
    sta='AAK' and net='II' will yield AAK_II).  If the sta code is 4
    characters long, css30_sta is the simple concatenation of the two
    strings (e.g. sta='HELL' and net='XX' yields 'HELLXX').  If the code
    found is longer than 4 characters it is assumed to already be a
    composite created with the same rules.  In that case the composite is
    used unaltered for the css30_sta name, and the index fsta and net
    values are added as sta and net in the update of the document to which
    they are linked.

    The function also tests the documents it processes for an existing net
    code.  If it finds one, it tests for the existence of css30_sta.  If
    that attribute is already defined it skips updating that document.  If
    css30_sta is not defined, it is the only field updated.  This was done
    as a way to use a scan of the site collection as an alternative to
    setting net and then using this function to set css30_sta as an alias
    for sta for some operations.

    :param db: is a mongodb database handle.  It can either be the plain
      result of a client('dbname') definition or a mspass Database class
      instance.
    :param staindex: is a dict returned by parse_snetsta.  The key of this
      index is a sta name the function expects to find in an arrival
      document.  The dict value is a dict with two keys:  fsta and net.
      net is the seed network code and fsta is the expected seed station
      code.  Updates replace the sta field in arrival with the fsta value
      and post the composite name as css30_sta.
    :param use_immortal_cursor: If True the cursor in the update is made
      "immortal", meaning it won't time out.  This may be necessary if the
      arrival collection is large.
    :param collection: MongoDB collection to scan to apply the snetsta
      correction.  (default is arrival)
    :return: tuple with these contents:
      0 - number of documents scanned
      1 - number updated
      2 - set of station names with no match in the snetsta index
      (these will often need additional attention through another
      mechanism)
    :rtype: tuple
    """
    col = db[collection]
    updaterec = {}
    nprocessed = 0
    nset = 0
    sta_not_found = set()
    # not quite sure how mongo handles this with a large collection.  We may
    # need to define chunks to be processed.
    if use_immortal_cursor:
        dbcursor = col.find({}, no_cursor_timeout=True)
    else:
        dbcursor = col.find({})
    for doc in dbcursor:
        doc_needs_update = True
        nprocessed += 1
        id = doc["_id"]
        dbsta = doc["sta"]
        if dbsta in staindex:
            updaterec.clear()
            xref = staindex[dbsta]
            net = xref["net"]
            sta = xref["fsta"]
            if "net" in doc:
                if "css30_sta" in doc:
                    # We use this case to detect previously processed data
                    # so we simply skip them
                    doc_needs_update = False
                else:
                    # Assume if we land here something else set net and
                    # we just need to set css30_sta
                    sta = doc["sta"]
                    net = doc["net"]
                    css30sta = make_css30_composite_sta(sta, net)
                    updaterec["css30_sta"] = css30sta
                    doc_needs_update = True
            else:
                if len(dbsta) <= 3:
                    updaterec["css30_sta"] = dbsta + "_" + net
                elif len(dbsta) == 4:
                    updaterec["css30_sta"] = dbsta + net
                else:
                    # We use this name directly in this case.  We don't
                    # force the antelope method, to allow flexibility.  It
                    # is possible a user creates an snetsta entry by hand
                    # and this will handle that correctly.
                    updaterec["css30_sta"] = dbsta
                updaterec["net"] = net
                updaterec["sta"] = sta
                doc_needs_update = True
        else:
            sta_not_found.add(dbsta)
            # do not update here as we have nothing to change
            doc_needs_update = False
        if doc_needs_update:
            col.update_one({"_id": id}, {"$set": updaterec})
            nset += 1
    return tuple([nprocessed, nset, sta_not_found])

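
# Hedged usage sketch combining parse_snetsta with this function; the file
# name is a hypothetical placeholder:
#
#   staindex = parse_snetsta("mydb.snetsta")
#   nproc, nset, missing = set_netcode_snetsta(db, staindex)
#   print(nproc, "documents scanned,", nset, "updated")
#   if missing:
#       print("stations needing other repair methods:", missing)
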
def set_netcode_from_site(
    db,
    collection="arrival",
    time_key=None,
    use_immortal_cursor=False,
    stations_to_ignore=None,
):
    """
    This function scans a MongoDB collection that is assumed to contain a
    "sta" station code to be cross referenced with metadata stored in the
    site collection.  (The default collection is arrival, but this could be
    used for any table for which sta needs to be regularized to a seed
    net:sta pair.)  The entire objective of the function is to add missing
    seed net codes.  The algorithm used here is the most basic possible and
    looks only for a match of sta and an optional time matched against a
    site's operational interval defined by its starttime to endtime window.

    It returns two containers of problem children that have to be handled
    separately:  (1) a set container of station names that have no matching
    value in the current site collection, and (2) a set container of tuples
    of (net, sta, starttime, endtime) values for net:sta combinations that
    are ambiguous.  They are defined as ambiguous if a common sta code
    appears in two or more networks.  Both cases need to be handled by
    subsequent processing.  The first requires scrounging for the station
    metadata.  A good foundation for that is obspy's get_stations function.
    The ambiguous station code problem requires rules and special handling.
    That will be dealt with in tools planned for the near future but which
    did not exist at the time this function was finalized.  The key idea in
    both cases is to use the output of this function to guide additional
    processing with two different workflows aimed at building a clean
    database to initiate processing.

    :param db: is a MongoDB database handle.  It can be as basic as the
      return of client('dbname') but it can also be an instance of the
      mspass Database class.  There is no default.
    :param collection: MongoDB collection to be updated.  Processing keys
      on the data with key="sta" and (optionally) the value of the time_key
      arg.  (Default is arrival)
    :param time_key: is a document key in collection containing a time used
      to match a site's operational starttime to endtime window.  Default
      is None, which turns off that selection.  Ambiguous keys may be
      reduced in large datasets by using a time key.
    :param use_immortal_cursor: If True the cursor in the update is made
      "immortal", meaning it won't time out.  This may be necessary if the
      arrival collection is large.
    :param stations_to_ignore: is expected to be a set container listing
      any station codes to be ignored in processing.  This can be used to
      reduce processing overhead or handle sites where net is null and not
      needed at all.  Default is None, which turns this option off.
    :return: Summary of results in the form of a 4 element list:
      0 - number of documents processed
      1 - number of documents updated in this run
      2 - set container of tuples with content
      (net, sta, starttime, endtime) for all documents matching the
      reference sta code but having different net codes or time spans.
      These data are stored in a set container to easily sort out the
      unique combinations.
      3 - set container of station codes found in collection that had no
      matching entry in the site collection.
    :rtype: list
    """
    dbh = db[collection]
    dbsite = db["site"]
    ambiguous_sta = set()
    not_found_set = set()
    # This is kind of an ugly way to handle a null ignore list, but it is
    # functional
    if stations_to_ignore is None:
        stations_to_ignore = set()
    query = {}
    updaterec = {}
    nprocessed = 0
    nupdates = 0
    if use_immortal_cursor:
        dbcursor = dbh.find({}, no_cursor_timeout=True)
    else:
        dbcursor = dbh.find({})
    for doc in dbcursor:
        nprocessed += 1
        id = doc["_id"]
        if not ("sta" in doc):
            print(
                "set_netcode_from_site (WARNING):  document with id=",
                id,
                " has no sta attribute - skipped",
            )
            continue
        sta = doc["sta"]
        if sta in stations_to_ignore:
            continue
        if "net" in doc:
            # silently skip records for which net is already defined, for
            # efficiency
            continue
        query.clear()
        query["sta"] = {"$eq": sta}
        if time_key is not None:
            if time_key in doc:
                time = doc[time_key]
                # site has starttime and endtime defined so no need to test
                # for their presence
                query["starttime"] = {"$lt": time}
                query["endtime"] = {"$gt": time}
            else:
                # for now just log this as an error
                print("Time key=", time_key, " not found in document for sta=", sta)
        found = dbsite.find(query)
        nfound = dbsite.count_documents(query)
        if nfound == 1:
            x = found.next()
            updaterec.clear()
            net = x["net"]
            updaterec["net"] = net
            dbh.update_one({"_id": id}, {"$set": updaterec})
            nupdates += 1
        elif nfound > 1:
            # this dependence on the set uniqueness approach may be a bit
            # inefficient for large collections.  Perhaps we should test
            # before adding.
            for x in found:
                net = x["net"]
                st = x["starttime"]
                et = x["endtime"]
                val = tuple([net, sta, st, et])
                ambiguous_sta.add(val)
        else:
            not_found_set.add(sta)
    return [nprocessed, nupdates, ambiguous_sta, not_found_set]

def set_netcode_time_interval(
    db,
    sta=None,
    net=None,
    collection="arrival",
    starttime=None,
    endtime=None,
    time_filter_key="time",
    use_immortal_cursor=False,
):
    """
    Forces setting the net code for data with a given station code within a
    specified time interval.

    Arrivals measured with Antelope, using Datascope to manage the catalog
    data, have a disconnect with seed's required net:sta pair to specify a
    unique seismic observatory (what we call site).  The css3.0 schema does
    not include the net attribute.  This collides with modern stationxml
    files used to deliver instrument metadata because they are always
    indexed by net:sta.  This function is one of a collection of functions
    to set the net field in a collection (normally arrival, but it could be
    other collections with ambiguous sta keys).  This particular function
    is intended as a last resort, to more or less force setting net to a
    single value for all documents matching the sta key.  There is an
    optional time range that can be used to fix ambiguous entries like some
    TA stations that were adopted and where nothing changed but the net
    code on a particular day.

    :param db: Database handle (can be a raw top level MongoDB database
      pointer or a mspass Database class)
    :param sta: station name to use as the key to set net
    :param net: net code to which all data matching sta will be set (a
      warning is issued if this field in the retrieved docs is already set)
    :param collection: MongoDB collection to be updated (default is
      arrival)
    :param starttime: starting time period of the (optional) time filter.
      (default turns this off)  Must be a UTCDateTime object.
    :param endtime: end of the time period for the (optional) time
      selection.  (default is off)  Note a MsPASSError will be thrown if
      endtime is defined but starttime is not, or vice versa.  Must be a
      UTCDateTime object.
    :param time_filter_key: key used to access the document entry to use
      for the time range test (starttime <= time <= endtime).
    :param use_immortal_cursor: If True the cursor in the update is made
      "immortal", meaning it won't time out.  This may be necessary if the
      arrival collection is large.
    :return: number of documents updated.
    """
    basemessage = "set_netcode_time_interval:  "
    if sta is None or net is None:
        raise MsPASSError(
            basemessage + "you must specify sta and net as required parameters",
            "Fatal",
        )
    dbarr = db[collection]
    query = {"sta": sta}
    if starttime is None or endtime is None:
        if starttime is not None:
            raise MsPASSError(
                basemessage
                + "usage error - starttime defined but endtime was left null"
            )
        elif endtime is not None:
            raise MsPASSError(
                basemessage
                + "usage error - endtime defined but starttime was left null"
            )
    else:
        if not isinstance(starttime, UTCDateTime):
            raise MsPASSError(
                basemessage
                + "usage error - starttime must be specified as an obspy UTCDateTime object"
            )
        if not isinstance(endtime, UTCDateTime):
            raise MsPASSError(
                basemessage
                + "usage error - endtime must be specified as an obspy UTCDateTime object"
            )
        tse = starttime.timestamp
        tee = endtime.timestamp
        query[time_filter_key] = {"$gte": tse, "$lte": tee}
    n = dbarr.count_documents(query)
    count = 0
    if n == 0:
        print(
            basemessage
            + "the following query returned no documents in collection "
            + collection
        )
        print(query)
    else:
        if use_immortal_cursor:
            curs = dbarr.find(query, no_cursor_timeout=True)
        else:
            curs = dbarr.find(query)
        for doc in curs:
            if "net" in doc:
                print(
                    basemessage + "WARNING:  found document with net code set to ",
                    doc["net"],
                )
                # this check is required for robustness when the time filter
                # is off
                if time_filter_key in doc:
                    print("Problem document time=", UTCDateTime(doc[time_filter_key]))
                print("Setting net in this document to requested net code=", net)
            oid = doc["_id"]
            updaterec = {"net": net}
            dbarr.update_one({"_id": oid}, {"$set": updaterec})
            count += 1
    return count

def find_null_net_stations(db, collection="arrival"):
    """
    Returns a set container of sta fields for documents with a null net
    code (key=net).  Scans the collection defined by the collection
    argument.
    """
    dbcol = db[collection]
    net_not_defined = set()
    curs = dbcol.find()
    for doc in curs:
        if "net" not in doc:
            sta = doc["sta"]
            net_not_defined.add(sta)
    return net_not_defined

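
# Hedged usage sketch; "db" is assumed to be an open database handle as in
# the examples above:
#
#   missing = find_null_net_stations(db)
#   print(len(missing), "stations in arrival have no net code:", missing)
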
def find_duplicate_sta(db, collection="site"):
    """
    Scans the requested collection (site is the default, but it can be run
    on channel) for combinations of net:sta where the sta is not unique.
    This can cause problems in associating data from css3.0 databases that
    do not have a net code for station names.  Returns a dict with sta
    names as keys and a set container with the net codes associated with
    each sta.

    The algorithm used here is a large memory algorithm but is considered
    acceptable since the total number of instruments in a data set is not
    currently expected to be a limiting factor.  If the collection were
    huge it would be better to use a well crafted incantation to mongodb.

    :param db: mspass Database handle or just a plain MongoDB database
      handle.  (mspass Database is a child of MongoDB's top level database
      handle)
    :param collection: string defining the collection name to scan
      (default is site)
    :return: dict of stations with nonunique sta codes as the keys.  The
      value returned for each key is a set container with the net codes
      that use that sta code.
    """
    dbcol = db[collection]
    allsta = {}
    curs = dbcol.find()
    # we do a brute force scan through the collection
    for rec in curs:
        if "net" in rec:
            net = rec["net"]
            sta = rec["sta"]
            if sta in allsta:
                # note this works only because the set container behaves
                # like std::set and adds of duplicates do nothing
                allsta[sta].add(net)
            else:
                allsta[sta] = {net}
        else:
            sta = rec["sta"]
            print(
                "find_duplicate_sta (WARNING):  ",
                collection,
                " collection has an undefined net code for station ",
                sta,
            )
            print("This is the full document from this collection")
            print(rec)
    # Now we have allsta with all unique station names.  We just look
    # for ones where the size of the set is not 1
    trouble_sta = {}
    for x in allsta:
        s = allsta[x]
        if len(s) > 1:
            trouble_sta[x] = s
    return trouble_sta

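
# Hedged usage sketch:
#
#   dups = find_duplicate_sta(db)
#   for sta, nets in dups.items():
#       print(sta, "appears in networks", nets)
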
def find_unique_sta(db, collection="site"):
    """
    This function is the complement to find_duplicate_sta.  It returns a
    dict of stations with one and only one matching net code.  Stations in
    that dict can normally be forced in arrival, assuming the arrival data
    are not disjoint from the station data.

    :param db: mspass Database handle or just a plain MongoDB database
      handle.  (mspass Database is a child of MongoDB's top level database
      handle)
    :param collection: string defining the collection name to scan
      (default is site)
    :return: dict with sta as keys and net as the unique net code
    """
    dbcol = db[collection]
    allsta = {}
    curs = dbcol.find()
    # we do a brute force scan through the collection
    for rec in curs:
        if "net" in rec:
            net = rec["net"]
            sta = rec["sta"]
            if sta in allsta:
                # note this works only because the set container behaves
                # like std::set and adds of duplicates do nothing
                allsta[sta].add(net)
            else:
                allsta[sta] = {net}
        else:
            sta = rec["sta"]
            print(
                "find_unique_sta (WARNING):  ",
                collection,
                " collection has an undefined net code for station ",
                sta,
            )
            print("This is the full document from this collection")
            print(rec)
    # Now we have allsta with all unique station names.  We just look
    # for ones where the size of the set is exactly 1
    unique_sta = {}
    for x in allsta:
        s = allsta[x]
        if len(s) == 1:
            # extract the single value from the one element set
            unique_sta[x] = next(iter(s))
    return unique_sta

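
# Hedged usage sketch showing how the output can feed force_net (defined at
# the end of this module):
#
#   unique = find_unique_sta(db)
#   for sta, net in unique.items():
#       force_net(db, sta=sta, net=net)
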
def check_for_ambiguous_sta(
    db, stalist, collection="arrival", verbose=False, verbose_attributes=None
):
    """
    Scans db.collection for any station in the list of station codes
    defined by the list container stalist.  By default it reports only the
    count of the number of hits for each sta.  If verbose is set it prints
    a summary of every record it finds - warning, this can get huge, so
    always run with verbose=False first to find the scope of the problem.

    :param db: MongoDB database pointer - can also be a mspass Database
      class object.
    :param stalist: required list of station names to be checked
    :param verbose: turn on verbose mode (see overview paragraph)
    :param verbose_attributes: list container of database attributes to be
      printed in verbose mode.  Note its default is None, and the function
      will exit immediately with an error message if verbose is enabled
      and this list is not defined.  The current version blindly assumes
      every document found will contain these attributes.  It will abort
      if an attribute is not defined.
    """
    if verbose and (verbose_attributes is None):
        print("check_for_ambiguous_sta:  usage error")
        print(
            "if verbose mode is turned on you need to supply a python list of db attributes to print"
        )
        return None
    if verbose:
        to_print = []
        for key in verbose_attributes:
            to_print.append(key)
        print(to_print)
    else:
        print("station count")
    dbhandle = db[collection]
    need_checking = []
    for sta in stalist:
        query = {"sta": sta}
        nsta = dbhandle.count_documents(query)
        if verbose and nsta > 0:
            curs = dbhandle.find(query)
            for rec in curs:
                to_print = []
                for key in verbose_attributes:
                    to_print.append(rec[key])
                print(to_print)
        else:
            print(sta, nsta)
        if nsta > 0:
            need_checking.append(tuple([sta, nsta]))
    return need_checking

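
# Hedged usage sketch chaining find_duplicate_sta into this check; the
# verbose_attributes list assumes "sta" and "time" are present in the
# arrival documents:
#
#   dups = find_duplicate_sta(db)
#   problems = check_for_ambiguous_sta(
#       db, list(dups.keys()), verbose=True, verbose_attributes=["sta", "time"]
#   )
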
def set_arrival_by_time_interval(
    db, sta=None, allowed_overlap=86401.0, use_immortal_cursor=False, verbose=False
):
    """
    Sets the net code in an arrival collection for occurrences of a
    specified station code, using the net code defined for a given time
    interval in the site collection.  This function only works reliably if
    the time intervals of the duplicate station names do not overlap in
    time.  The type example where this function is useful is the adoption
    of TA net code stations by other networks.  Those stations typically
    changed nothing except the network code at some specific time,
    although often the channel configuration also changed (e.g. many N4
    sites turned on 100 sps data as H channels).

    This function is a bit like the related function
    set_netcode_time_interval, but here the site time intervals for a
    specified sta field are used.  set_netcode_time_interval is brutal and
    will blindly set all matching sta in an optional time range to a
    specified value.  This function is preferable when the site collection
    has an unambiguous net defined by time intervals that do not overlap.
    The function will throw an exception and do nothing if the time
    intervals returned by the match to sta overlap.

    :param db: mspasspy.db.Database handle (requires arrival and site
      collections)
    :param sta: station code in arrival to be updated.
    :param allowed_overlap: There are lots of errors in stationxml files
      that cause bogus one day overlaps.  This defaults to 1 day but it
      can be set larger or smaller.
    :param use_immortal_cursor: If True the cursor in the update is made
      "immortal", meaning it won't time out.  This may be necessary if the
      arrival collection is large.
    :param verbose: if True prints a few messages.  Silent (default)
      otherwise.
    :return: count of the number of documents updated.
    """
    if sta is None:
        raise MsPASSError(
            "Missing required parameter sta=station code to repair", "Fatal"
        )
    dbarr = db.arrival
    dbsite = db.site
    query = {"sta": sta}
    if use_immortal_cursor:
        curs = dbsite.find(query, no_cursor_timeout=True).sort("starttime", 1)
    else:
        curs = dbsite.find(query).sort("starttime", 1)
    # First make sure we don't have any overlapping time periods
    n = 0
    for doc in curs:
        if n == 0:
            lastend = doc["endtime"]
            lastnet = doc["net"]
        else:
            stime = doc["starttime"]
            if stime + allowed_overlap < lastend:
                net = doc["net"]
                message = "Overlapping time intervals found in site. \n "
                message += "Record 1 has net={lnet} and endtime {etime}\n"
                message += "Record 2 has net={net} and starttime {stime}"
                message = message.format(
                    lnet=lastnet,
                    net=net,
                    etime=str(UTCDateTime(lastend)),
                    stime=str(UTCDateTime(stime)),
                )
                raise MsPASSError(message, "Fatal")
            lastend = doc["endtime"]
            lastnet = doc["net"]
        n += 1
    curs.rewind()
    count = 0
    for doc in curs:
        net = doc["net"]
        arquery = dict()
        arquery["sta"] = sta
        arquery["time"] = {"$gte": doc["starttime"], "$lte": doc["endtime"]}
        if verbose:
            print(
                "Setting net=",
                net,
                " for time interval=",
                UTCDateTime(doc["starttime"]),
                UTCDateTime(doc["endtime"]),
            )
            nset = dbarr.count_documents(arquery)
            print("Setting net code to ", net, " in ", nset, " documents")
        # note we must use the time-bounded query here, not the site query,
        # or every pass would update all arrivals for this sta
        arcursor = dbarr.find(arquery)
        for ardoc in arcursor:
            id = ardoc["_id"]
            dbarr.update_one({"_id": id}, {"$set": {"net": net}})
            count += 1
    return count

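
# Hedged usage sketch for a hypothetical adopted TA station:
#
#   n = set_arrival_by_time_interval(db, sta="Y22D", verbose=True)
#   print("updated", n, "documents")
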
def force_net(db, sta=None, net=None):
    """
    Forces all entries in the arrival collection matching the input station
    code sta to the input value of the parameter net.  This is the most
    brute force solution for setting a net code, but it is often the right
    tool.  Kind of like every toolbox needs a hammer.

    :param db: Database handle (function hits only the arrival collection)
    :param sta: station to set
    :param net: network code to set matching sta entries to
    :return: number of documents set.
    """
    if sta is None or net is None:
        raise MsPASSError(
            "force_net (usage error):  missing required sta and net argument", "Fatal"
        )
    dbarr = db.arrival
    query = {"sta": sta}
    curs = dbarr.find(query)
    n = 0
    for doc in curs:
        oid = doc["_id"]
        dbarr.update_one({"_id": oid}, {"$set": {"net": net}})
        n += 1
    return n

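
# Hedged usage sketch; the codes are hypothetical:
#
#   n = force_net(db, sta="HELL", net="XX")
#   print("set net=XX in", n, "documents")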