Source code for cdms2.cdscan

#!/usr/bin/env python
from __future__ import print_function

import sys
import getopt
import cdms2
from cdms2.grid import lookupArray
from cdms2.axis import calendarToTag, tagToCalendar
from cdms2.cdmsobj import CdFromObject, CdString, CdScalar, CdFloat, CdDouble, CdShort, CdInt, CdLong
import numpy
import string
import cdtime
import os.path
import copy
from cdms2 import cdmsNode
import re
from functools import reduce
from cdms2.error import CDMSError
from collections import OrderedDict
from six import string_types
usage = """Usage:
    cdscan [options] <files>

    Scan a list of files producing a CDMS dataset in XML representation. See Notes below
    for a more complete explanation.

Arguments:

    <files> is a list of file paths to scan. The files can be listed in any order, and may
    be in multiple directories.  A file may also be a CDML dataset (.xml or .cdml), in
    which case the dataset(s) and files are combined into a new dataset.

Options:

    -a alias_file: change variable names to the aliases defined in an alias file.
                   Each line of the alias file consists of two blank separated
                   fields: variable_id alias. 'variable_id' is the ID of the variable
                   in the file, and 'alias' is the name that will be substituted for
                   it in the output dataset. Only variables with entries in the alias_file
                   are renamed.

    -c calendar:   either "gregorian", "proleptic_gregorian", "julian", "noleap", or "360_day". Default:
                   "gregorian". This option should be used with caution, as it will
                   override any calendar information in the files.

    -d dataset_id: dataset identifier. Default: "none"

    -e newattr:    Add or modify attributes of a file, variable, or
           axis. The form of 'newattr' is either:

           'var.attr = value' to modify a variable or axis attribute, or
           '.attr = value' to modify a global (file) attribute.

           In either case, 'value' may be quoted to preserve spaces
           or force the attribute to be treated as a string. If
           'value' is not quoted and the first character is a
           digit, it is converted to integer or
           floating-point. This option does not modify the input
           datafiles. See notes and examples below.

    --exclude var,var,...
                   Exclude specified variables. The argument
                   is a comma-separated list of variables containing no blanks.
                   In contrast to --exclude-file, this skips the variables regardless
                   of the file(s) in which they are contained, but processes other
                   variables in the files.
                   Also see --include.

    --exclude-file pattern
                   Exclude files with a basename matching the regular expression pattern.
                   In contrast to --exclude, this skips the file entirely. Multiple patterns
                   may be listed by separating with vertical bars (e.g. abc|def ). Note
                   that the match is to the initial part of the basename. For example, the
                   pattern 'st' matches any basename starting with 'st'.

    -f file_list:  file containing a list of absolute data file names, one per
                   line. <files> arguments are ignored.

    --forecast     generate a description of a forecast dataset.
                   This is not compatible with the -i, -r, -t, or -l options.
                   A file can contain data for exactly one forecast; its
                   forecast_reference_time (aka run time, analysis time, starting time,
                   generating time, tau=0 time) is specified by the nbdate,nbsec variables.
                   Each file's time axis will be interpreted as the forecast_period (aka
                   tau, the interval from the forecast_reference_time to the current time)
                   regardless of its units, standard_name, or other attributes.

    -h:            print a help message.

    -i time_delta: scan time as a 'linear' dimension. This is useful if the time dimension
                   is very long. The argument is the time delta, a float or integer.  For
                   example, if the time delta is 6 hours, and the reference units are
                   "hours since xxxx", set the interval delta to 6.  The default value is
                   the difference of the first two timepoints.

    --ignore-open-error:
                   Ignore open errors. Print a warning and continue.

    --include var,var,...
                   Only include specified variables in the output. The argument
                   is a comma-separated list of variables containing no blanks.
                   Also see --exclude.

    --include-file pattern
                   Only include files with a basename matching the regular expression pattern.
                   In contrast to --include, this skips files entirely if they do not
                   match the pattern. Multiple patterns
                   may be listed by separating with vertical bars (e.g. abc|def ). Note
                   that the match is to the initial part of the basename. For example, the
                   pattern 'st' matches any basename starting with 'st'.

    -j:        scan time as a vector dimension. Time values are listed
           individually. Turns off the -i option.

    -l levels:     list of levels, comma-separated. Only specify if files are partitioned by
                   levels.

    -m levelid:    name of the vertical level dimension. The default is determined from
                   the files: the dimension whose name starts with "lev", or that has
                   the attribute axis='Z' (see Note (3)).

    --notrim-lat:  Don't trim latitude values (in degrees) to the range [-90..90]. By default
           latitude values are trimmed.

    -p template:   Compatibility with pre-V3.0 datasets. 'cdimport -h' describes template strings.

    -q:            quiet mode

    -r time_units: time units of the form "<units> since yyyy-mm-dd hh:mi:ss", where
                   <units> is one of "year", "month", "day", "hour", "minute", "second".
                   Trailing fields may be omitted. The default is the units of the first
                   time dimension found.

    -s suffix_file: Append a suffix to variable names, depending on the directory
                   containing the data file.  This can be used to distinguish variables
                   having the same name but generated by different models or ensemble
                   runs. 'suffix_file' is the name of a file describing a mapping between
                   directories and suffixes.  Each line consists of two blank-separated
                   fields: 'directory' 'suffix'. Each file path is compared to the
                   directories in the suffix file. If the file path is in that directory
                   or a subdirectory, the corresponding suffix is appended to the variable
                   IDs in the file. If more than one such directory is found, the first
                   directory found is used. If no match is made, the variable ids are not
                   altered.  Regular expressions can be used: see the example in the Notes
                   section.

    -t timeid:     id of the partitioned time dimension. The default is the name of the time
                   dimension.

    --time-linear tzero,delta,units[,calendar]
                   Override the time dimension(s) with a linear time dimension. The arguments are
                   a comma-separated list:

                   tzero is the initial time point, a floating-point value.
                   delta is the time delta, floating-point.
                   units are time units as specified in the [-r] option.
                   calendar is optional, and is specified as in the [-c] option. If omitted, it
                     defaults to the value specified by [-c], otherwise as specified in the file.

                   Example: --time-linear '0,1,months since 1980,noleap'

                   Note (6) compares this option with [-i] and [-r].

    --var-locate 'var,file_pattern':
                   Only scan a variable if the basename of the file matches the pattern. This
                   may be used to resolve duplicate variable errors. var and file_pattern are
                   separated by a comma, with no blanks.

                   var is the name of the variable
                   file_pattern is a regular expression following the Python re module syntax.

                   Example: to scan variable ps from files starting with the string 'ps_':
                     --var-locate 'ps,ps_.*'

    -x xmlfile:    XML filename. By default, output is written to standard output.

Example:

    cdscan -c noleap -d test -x test.xml [uv]*.nc
    cdscan -d pcmdi_6h -i 0.25 -r 'days since 1979-1-1' *6h*.ctl
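
    A further illustrative combination (the dataset id 'myset' is a made-up
    example), setting the time units before scanning as described in Note (5):

    cdscan -d myset -e 'time.units = "days since 1979-1-1"' -x myset.xml *.nc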

Notes:

    (1) The files can be in netCDF, GrADS/GRIB, HDF, or DRS format, and can be listed in
    any order. Most commonly, the files are the result of a single experiment, and the
    'partitioned' dimension is time. The time dimension of a variable is the coordinate
    variable having a name that starts with 'time' or having an attribute "axis='T'". If
    this is not the case, specify the time dimension with the -t option. The time
    dimension should be in the form supported by cdtime. If this is not the case (or to
    override them) use the -r option.

    (2) The basic form of the command is 'cdscan <files>'. By default, the time values are
    listed explicitly in the output XML. This can cause a problem if the time dimension is
    very long, say for 6-hourly data. To handle this the form 'cdscan -i delta <files>'
    may be used. This generates a compact time representation of the form <start, length,
    delta>. An exception is raised if the time dimension for a given file is not linear.

    (3) Another form of the command is 'cdscan -l lev1,lev2,..,levn <files>'. This asserts
    that the dataset is partitioned in both time and vertical level dimensions. The level
    dimension of a variable is the dimension having a name that starts with "lev", or
    having an attribute "axis=Z". If this is not the case, set the level name with the -m
    option.

    (4) An example of a suffix file:

    /exp/pr/ncar-a  _ncar-a
    /exp/pr/ecm-a   _ecm-a
    /exp/ta/ncar-a  _ncar-a
    /exp/ta/ecm-a   _ecm-a

    For all files in directory /exp/pr/ncar-a or a subdirectory, the corresponding
    variable ids will be appended with the suffix '_ncar-a'.  Regular expressions can be
    used, as defined in the Python 're' module. For example, the previous example can be
    replaced with the single line:

    /exp/[^/]*/([^/]*) _\g<1>

    Note the use of parentheses to delimit a group. The syntax \g<n> refers to the n-th
    group matched in the regular expression, with the first group being n=1. The string
    [^/]* matches any sequence of characters other than a forward slash.

    (5) Adding or modifying attributes with the -e option:

    time.units = "days since 1979-1-1"

    sets the units of all variables/axes to "days since 1979-1-1". Note
    that since this is applied before any other processing, it allows
    non-COARDS time units to be overridden.

    .newattr=newvalue

    Set the global file attribute 'newattr' to 'newvalue'.

    (6) The [--time-linear] option overrides the time values in the file(s). The resulting
    dimension does not have any gaps. In contrast, the [-i], [-r] options use the specified
    time units (from [-r]), and calendar from [-c] if specified, to convert the file times
    to the new units. The resulting linear dimension may have gaps.

    In either case, the files are ordered by the time values in the files.

    The [--time-linear] option should be used with caution, as it is applied to all the time
    dimensions found.
"""

# Ensure that arrays are fully printed to XML files
numpy.set_printoptions(threshold=numpy.inf)

calendarMap = tagToCalendar

reverseCalendarMap = calendarToTag

attrPattern = re.compile(r'\s*(\w*)\.(\w+)\s*=\s*(.*)$')
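# Illustrative sketch (not executed): attrPattern splits a '-e' argument into
# (variable, attribute, value) groups; an empty first group denotes a global
# (file) attribute.
#
#     >>> attrPattern.match('time.units = "days since 1979-1-1"').groups()
#     ('time', 'units', '"days since 1979-1-1"')
#     >>> attrPattern.match('.newattr=newvalue').groups()
#     ('', 'newattr', 'newvalue')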
cdms2.setNetcdfUseParallelFlag(0)


def timestamp():
    "Generate a timestamp."
    import time
    y, m, d, h, mi, s, w, dy, ds = time.gmtime(time.time())
    return "%d-%d-%d %d:%d:%d" % (y, m, d, h, mi, s)

def timeindex(value, units, basetime, delta, calendar):
    """ Calculate (t - basetime)/delu
    where t = reltime(value, units)
    and delu is the time interval (delta, delunits) (e.g., 1 month).
    """
    if units.find(" as ") == -1:
        tval = cdtime.reltime(value, units)
    else:
        tval = cdtime.abstime(value, units)
    newval = tval.torel(basetime, calendar)
    if delta is None:
        return newval.value
    else:
        return newval.value / delta

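# Illustrative sketch (not executed): with a delta of 0.5 days, a value 12 hours
# past the reference time maps to index 1. The units are made-up examples, and
# the calendar constant is assumed to come from cdtime.
#
#     >>> timeindex(12.0, "hours since 2000-1-1", "days since 2000-1-1", 0.5,
#     ...           cdtime.GregorianCalendar)
#     1.0
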
def combineKeys(mydict, typedict, timeIsLinear=0, referenceDelta=None, forecast=None):
    """ Combine dictionary keys into an axis.
    mydict: (i,j) => (path, axisname)
    typedict is either timedict or levdict or fcdict.
    timeIsLinear is true iff time has a linear representation.
    referenceDelta is only used for error checks if timeIsLinear is true.
    """
    global verbose

    # Sort the projected time, level indices
    keys = list(OrderedDict(sorted(mydict.items())).keys())

    axislist = []
    prevend = None
    prevpath = None
    name0 = None
    compressPart = []
    partition = []
    previ = 0
    firstunits = ""
    prevvals = None
    coordToInd = {(None, None): (None, None)}
    linCoordToInd = {(None, None): (None, None)}
    iadj = None
    errorOccurred = 0
    for i0, i1 in keys:
        path, name = mydict[(i0, i1)]
        if name0 is None:
            name0 = name
        values, units, dummy = typedict[(path, name)]
        if firstunits == "":
            firstunits = units
        if prevend is not None and prevend >= i0:
            if prevend >= i1:
                if verbose:
                    print('Warning, file %s, dimension %s contains values in file %s' % (
                        prevpath, name, path), file=sys.stderr)
                if timeIsLinear:
                    iind = lookupArray(prevvals, values[0])
                    jind = lookupArray(prevvals, values[-1])
                else:
                    iind = lookupArray(prevvals, i0)
                    jind = lookupArray(prevvals, i1)
                if len(values) != (jind - iind + 1):
                    raise RuntimeError(
                        'Dimension %s in files %s [len(%s)=%d], %s [len(%s)=%d], is inconsistent' %
                        (name, prevpath, name, (jind - iind + 1), path, name, len(values)))
                coordToInd[(i0, i1)] = (iind, jind)

                prevspart, prevepart = partition[-1]
                linCoordToInd[(i0, i1)] = (prevspart + iind, prevspart + jind + 1)
                continue
            else:  # Fix partial overlap
                if timeIsLinear:
                    jind = lookupArray(prevvals, values[0])
                else:
                    jind = lookupArray(prevvals, i0)
                if verbose:
                    print('Warning, file %s, dimension %s overlaps file %s, value=%f' % (
                        prevpath, name, path, prevvals[jind]), file=sys.stderr)
                previ, prevj = compressPart[-1]
                prevj = previ + jind
                axislist[-1] = prevvals[0:jind]
                compressPart[-1] = (previ, prevj)
                coordToInd[(prevvals[0], prevvals[-1])] = (previ, prevj)
                previ = prevj

                prevspart, prevepart = partition[-1]
                prevepart = prevspart + jind
                partition[-1] = (prevspart, prevepart)
                linCoordToInd[(prevvals[0], prevvals[-1])] = (prevspart, prevepart)

        axislist.append(values)
        prevend = i1
        prevpath = path
        prevj = previ + len(values)
        compressPart.append((previ, prevj))
        coordToInd[(i0, i1)] = (previ, prevj)

        if iadj is None:  # partition has to start with 0
            iadj = int(i0)
        spart = int(i0) - iadj
        epart = int(i1) + 1 - iadj
        partition.append((spart, epart))
        linCoordToInd[(i0, i1)] = (spart, epart)
        if timeIsLinear and len(values) != (epart - spart):
            # Find the bad values
            diffs = values[1:] - values[:-1]
            badindices = numpy.compress(
                numpy.not_equal(diffs, referenceDelta), list(range(len(values))))
            badvalues = numpy.take(values, badindices)
            if verbose:
                print("Error: Missing values in %s after times: %s." % (path, str(badvalues)) +
                      " Set delta with the -i option or turn off linear mode with the -j option.",
                      file=sys.stderr)
            errorOccurred = 1

        prevvals = values
        previ = prevj

    fullaxis = numpy.ma.concatenate(axislist)
    return fullaxis, name0, compressPart, coordToInd, firstunits, partition, linCoordToInd, errorOccurred

def useKeys(mydict, typedict, timeIsLinear=0, referenceDelta=None, forecast=None):
    """ Use dictionary keys for an axis.  This is like combineKeys (same
    arguments, same return values, was written by simplifying combineKeys), but
    this doesn't do nearly so much because this is for an axis where there is no
    splitting across files, hence partitions are not needed.
    mydict: (i,j) => (path, axisname)
    typedict is either timedict or levdict or fcdict.
    timeIsLinear is true iff time has a linear representation.
    referenceDelta is only used for error checks if timeIsLinear is true.
    """
    global verbose

    # Sort the projected time, level indices
    keys = list(OrderedDict(sorted(mydict.items())).keys())

    axislist = []
    name0 = None
    # compressPart = []
    compressPart = None
    # partition = []
    partition = None
    # previ = 0
    firstunits = None
    # coordToInd = {(None,None):(None,None)}
    # linCoordToInd = {(None,None):(None,None)}
    coordToInd = None
    linCoordToInd = None
    errorOccurred = 0
    for i0, i1 in keys:
        path, name = mydict[(i0, i1)]
        if name0 is None:
            name0 = name
        values, units, dummy = typedict[(path, name)]
        if firstunits is None:
            firstunits = units
        axislist.append(values)
        # prevj = previ+len(values)
        # coordToInd[(i0,i1)] = (previ, prevj)
    fullaxis = numpy.ma.concatenate(axislist)
    return fullaxis, name0, compressPart, coordToInd, firstunits, partition, linCoordToInd, errorOccurred

def copyDict(mydict):
    """Copy a dictionary-like object mydict to a true dictionary"""
    result = {}
    for key in mydict.keys():
        result[key] = mydict[key]
    return result

def disambig(name, mydict, num, comparator, value):
    """ Make a unique name from name, wrt the keys in dictionary mydict.
    Try using num first. comparator(value, mydict[name]) returns 0 if equal, 1 if not.
    """
    if name not in mydict or not comparator(value, mydict[name]):
        uniqname = name
    else:
        uniqname = '%s_%d' % (name, num)
        if uniqname in mydict and comparator(value, mydict[uniqname]):
            trial_name = uniqname
            # string.lowercase exists only in Python 2; use the Python 2/3 name
            for letter in string.ascii_lowercase:
                uniqname = '%s_%s' % (trial_name, letter)
                if uniqname not in mydict or not comparator(value, mydict[uniqname]):
                    break
            else:
                raise BaseException('Cannot make axis name unique: ' + str(name))

    return uniqname

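# Illustrative sketch (not executed): if 'lat' is already taken by a different
# axis, disambig appends the length, yielding e.g. 'lat_64' for a 64-point axis.
# The arrays below are made-up examples.
#
#     >>> import numpy
#     >>> saved = {'lat': numpy.arange(32)}
#     >>> disambig('lat', saved, 64, compareaxes, numpy.arange(64))
#     'lat_64'
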
def compareaxes(axis1, axis2):
    """Return 0 if equal, 1 if not"""
    return ((len(axis1) != len(axis2)) or
            not numpy.ma.allclose(axis1[:], axis2[:]))

def comparedomains(domain1, domain2):
    """Return 0 if equal, 1 if not"""
    if len(domain1) != len(domain2):
        return 1
    for i in range(len(domain1)):
        item1 = domain1[i]
        item2 = domain2[i]
        if not isinstance(item1, type(item2)):
            return 1
        if isinstance(item1, string_types):
            return item1 != item2
        elif compareaxes(item1, item2):
            return 1
    return 0

def compareVarDictValues(val1, val2):
    return comparedomains(val1[0], val2[0])

def cleanupAttrs(attrs):
    for attname in attrs.keys():
        attval = attrs[attname]
        if isinstance(attval, numpy.ndarray):
            if attval.ndim == 0:
                continue
            elif len(attval) == 1:
                attrs[attname] = attval[0]
            else:
                attrs[attname] = str(attval)
    if 'missing_value' in attrs and attrs['missing_value'] is None:
        del attrs['missing_value']
    if 'ndim' in attrs:
        del attrs['ndim']

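# Illustrative sketch (not executed): single-element array attributes are
# reduced to scalars and the internal 'ndim' entry is dropped, so attributes
# serialize cleanly to XML. The attribute values are made up, and the exact
# repr of the scalar may vary by numpy version.
#
#     >>> attrs = {'scale_factor': numpy.array([0.01]), 'ndim': 3}
#     >>> cleanupAttrs(attrs)
#     >>> attrs
#     {'scale_factor': 0.01}
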
def validateAttrs(node):
    """Compare attributes against DTD."""
    global verbose

    if hasattr(node, 'datatype'):
        parenttype = node.datatype
    else:
        parenttype = None
    atts = node.getExternalDict()
    for attname in atts.keys():
        (attval, datatype) = atts[attname]  # (XML value, datatype)
        constraint = node.extra.get(attname)
        if constraint is not None:
            (scaletype, reqtype) = constraint  # (CdScalar|CdArray, required type)
            if reqtype == CdFromObject:
                reqtype = parenttype
            if reqtype != datatype and datatype == CdString and scaletype == CdScalar:
                if reqtype in (CdFloat, CdDouble) and not isinstance(attval, float):
                    try:
                        attval = float(attval)
                    except BaseException:
                        if verbose:
                            print("Warning: %s=%s should be a float, id=%s" % (
                                attname, attval, node.id), end=' ', file=sys.stderr)
                        try:
                            attval = int(attval)
                            attval = float(attval)
                            if verbose:
                                print("(Recasting)")
                            node.setExternalAttr(attname, attval)
                        except BaseException:
                            if attname in ['modulo', 'add_offset', 'scale_factor']:
                                if verbose:
                                    print("(Removing)")
                                attdict = node.getExternalDict()
                                del attdict[attname]
                            else:
                                if verbose:
                                    print("")
                elif reqtype in (CdShort, CdInt, CdLong) and not isinstance(attval, int):
                    try:
                        attval = int(attval)
                    except BaseException:
                        if verbose:
                            print("Warning: %s=%s should be an integer, id=%s" % (
                                attname, attval, node.id), end=' ', file=sys.stderr)
                        try:
                            attval = float(attval)
                            attval = int(attval)
                            if verbose:
                                print("(Recasting)")
                            node.setExternalAttr(attname, attval)
                        except BaseException:
                            if verbose:
                                print("")

def cloneWithLatCheck(axis):
    """Clone an axis, ensuring that latitudes (in degrees) are in the range [-90:90]"""
    global verbose
    global notrimlat

    axisvals = origvals = axis[:]
    if axis.isLatitude() and hasattr(axis, "units") and axis.units[0:6].lower() == "degree":
        if notrimlat == 0:
            axisvals = numpy.maximum(-90.0, numpy.minimum(90.0, axisvals))
        if not numpy.ma.allclose(axisvals, origvals) and verbose:
            print("Warning: resetting latitude values: ", origvals,
                  " to: ", axisvals, file=sys.stderr)

    b = axis.getBounds()
    mycopy = cdms2.createAxis(copy.copy(axisvals))
    mycopy.id = axis.id
    try:
        mycopy.setBounds(b)
    except CDMSError:
        b = mycopy.genGenericBounds()
        mycopy.setBounds(b)
    for k, v in axis.attributes.items():
        setattr(mycopy, k, v)
    return mycopy

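# Illustrative sketch (not executed): with the default notrimlat == 0, latitude
# values outside [-90, 90] are clipped in the clone. The axis values below are
# made up.
#
#     >>> ax = cdms2.createAxis(numpy.array([-91.0, 0.0, 91.0]))
#     >>> ax.id = 'latitude'; ax.designateLatitude(); ax.units = 'degrees_north'
#     >>> cloneWithLatCheck(ax)[:]
#     array([-90.,   0.,  90.])
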
def addAttrs(fobj, eattrs):
    """Add extra attributes to file/dataset fobj.
    eattrs has the form [(varid,attr,value), (varid,attr,value), ...]
    where if varid is '', set the global attribute."""
    for evar, eattr, evalue in eattrs:
        if evar == '':
            fobj.__dict__[eattr] = evalue
        else:
            varobj = fobj[evar]
            if varobj is not None:
                varobj.__dict__[eattr] = evalue

def setNodeDict(node, mydict):
    for key in mydict.keys():
        value = mydict[key]
        if (isinstance(value, numpy.integer) or isinstance(value, int)):
            datatype = CdLong
        elif (isinstance(value, numpy.floating) or isinstance(value, float)):
            datatype = CdDouble
        else:
            datatype = CdString
        node.attribute[key] = (value, datatype)

def initialize_filemap(filemap, timedict, levdict, timeid, extendDset, splitOnTime,
                       referenceTime, timeIsLinear, referenceDelta, splitOnLevel,
                       dirlen, overrideCalendar):
    # This function was formerly part of the body of "main".
    # Initialize filemap : varid => (tc0, tc1, lc0, lc1, path, timeid, levid)
    #   where tc0 is the first time index relative to the reference time, tc1 the last,
    #   lc0 is the first level, lc1 the last, path is the filename, timeid is the id
    #   of the time dimension of the variable, levid is the id of the level dimension
    #
    # timedict : (path, timeid) => (timearray, timeunits, calendar)
    #
    # levdict : (path, levelid) => (levelarray, levelunits, None)
    #
    initfilemap = cdms2.dataset.parseFileMap(extendDset.cdms_filemap)
    dsetdirec = extendDset.directory
    for namelist, slicelist in initfilemap:
        for name in namelist:
            var = extendDset[name]
            timeaxis = var.getTime()
            if timeaxis is not None and not overrideCalendar:
                calendar = timeaxis.getCalendar()
            if splitOnTime and timeaxis is not None:
                if hasattr(timeaxis, 'name_in_file'):
                    timeid = timeaxis.name_in_file
                else:
                    timeid = timeaxis.id
                if referenceTime is None:
                    referenceTime = timeaxis.units
                if timeIsLinear in [None, 1]:
                    timeIsLinear = timeaxis.isLinear()
                    if timeIsLinear:
                        if len(timeaxis) > 1:
                            referenceDelta = timeaxis[1] - timeaxis[0]
                        else:
                            referenceDelta = 1.0
                    else:
                        referenceDelta = None
            else:
                timeid = None
            levelaxis = var.getLevel()
            if splitOnLevel and levelaxis is not None:
                if hasattr(levelaxis, 'name_in_file'):
                    levid = levelaxis.name_in_file
                else:
                    levid = levelaxis.id
            else:
                levid = None

            varmaplist = []
            for t0, t1, lev0, lev1, path in slicelist:
                fullpath = os.path.join(dsetdirec, path)
                basepath = fullpath[dirlen:]
                if t0 is not None:
                    tc0 = timeindex(timeaxis[t0], timeaxis.units,
                                    referenceTime, referenceDelta, calendar)
                    tc1 = timeindex(timeaxis[t1 - 1], timeaxis.units,
                                    referenceTime, referenceDelta, calendar)
                    if (basepath, timeid, calendar) not in timedict:
                        values = timeaxis[t0:t1]
                        timedict[(basepath, timeid)] = (values, timeaxis.units, calendar)
                else:
                    tc0 = tc1 = None
                if lev0 is not None:
                    lc0 = levelaxis[lev0]
                    lc1 = levelaxis[lev1 - 1]
                    if (basepath, levid, None) not in levdict:
                        values = levelaxis[lev0:lev1]
                        levdict[(basepath, levid)] = (values, levelaxis.units, None)
                else:
                    lc0 = lc1 = None
                varmaplist.append((tc0, tc1, lc0, lc1, basepath, timeid, levid, calendar))
            if name in filemap:
                filemap[name].extend(varmaplist)
            else:
                filemap[name] = varmaplist

# ------------------------------------------------------------------------
verbose = 1

def main(argv):
    global verbose
    global notrimlat

    try:
        args, lastargs = getopt.getopt(
            argv[1:], "a:c:d:e:f:hi:jl:m:p:qr:s:t:x:",
            ["include=", "include-file=", "exclude=", "exclude-file=",
             "forecast", "time-linear=", "notrim-lat", "var-locate=",
             "ignore-open-error"])
    except getopt.error:
        print(sys.exc_info()[1], file=sys.stderr)
        print(usage, file=sys.stderr)
        sys.exit(0)

    calendar = None
    calenkey = None
    timeid = None
    levelid = None
    notrimlat = 0
    referenceTime = None
    referenceDelta = None
    readFromFile = 0
    splitOnTime = 1
    splitOnLevel = 0
    datasetid = "none"
    timeIsLinear = None
    writeToStdout = 1
    templatestr = None
    timeIsVector = None
    modelMapFile = None
    aliasMapFile = None
    overrideCalendar = 0
    extraAttrs = []
    extraDict = {}
    includeList = None
    excludeList = None
    overrideTimeLinear = None
    varLocate = None
    ignoreOpenError = False
    excludePattern = None
    includePattern = None
    forecast = False
    for flag, arg in args:
        if flag == '-a':
            aliasMapFile = arg
        elif flag == '-c':
            calenkey = arg.lower()
            calendar = calendarMap[calenkey]
            overrideCalendar = 1
        elif flag == '-d':
            datasetid = arg
        elif flag == '-e':
            matchObj = attrPattern.match(arg)
            if matchObj is None:
                raise RuntimeError(
                    "Expression must have form '[var].attr=value': %s" % arg)
            matchGroups = matchObj.groups()
            if len(matchGroups) != 3:
                raise RuntimeError(
                    "Expression must have form '[var].attr=value': %s" % arg)
            matchValue = matchGroups[2]
            if len(matchValue) > 0 and (matchValue[0].isdigit() or
                                        matchValue[0] in ['"', "'", "-", "+"]):  # "
                matcheval = eval(matchValue)
            else:
                matcheval = str(matchValue)
            extraAttrs.append((matchGroups[0], matchGroups[1], matcheval))
        elif flag == '--exclude':
            if arg[0] == '-':
                raise RuntimeError("--exclude option requires an argument")
            excludeList = arg.split(',')
        elif flag == '--exclude-file':
            excludePattern = arg
        elif flag == '-f':
            readFromFile = 1
            filelistpath = arg
        elif flag == '--forecast':  # experimental forecast mode
            forecast = True
            splitOnTime = 0
            splitOnLevel = 0
        elif flag == '-h':
            print(usage)
            sys.exit(0)
        elif flag == '-i':
            splitOnTime = 1
            referenceDelta = float(arg)
            timeIsLinear = 1
            timeIsVector = None
        elif flag == '--ignore-open-error':
            ignoreOpenError = True
        elif flag == '--include':
            if arg[0] == '-':
                raise RuntimeError("--include option requires an argument")
            includeList = arg.split(',')
        elif flag == '--include-file':
            includePattern = arg
        elif flag == '-j':
            timeIsVector = 1
            timeIsLinear = None
        elif flag == '-l':
            splitOnLevel = 1
            levelstr = arg.split(',')
            levellist = list(map(float, levelstr))
            levels = numpy.array(levellist)
            levels = numpy.sort(levels)
        elif flag == '-m':
            levelid = arg
            args.append(('-e', '%s.axis=Z' % levelid))  # Add axis=Z attribute
        elif flag == '--notrim-lat':
            notrimlat = 1
        elif flag == '-p':
            templatestr = arg
        elif flag == '-q':
            verbose = 0
        elif flag == '-r':
            splitOnTime = 1
            referenceTime = arg
        elif flag == '-s':
            modelMapFile = arg
        elif flag == '-t':
            splitOnTime = 1
            timeid = arg
            args.append(('-e', '%s.axis=T' % timeid))  # Add axis=T attribute
        elif flag == '--time-linear':
            targlist = arg.split(',')
            ttzero = float(targlist[0])
            tdelta = float(targlist[1])
            tunits = targlist[2].strip()
            if len(targlist) == 4:
                tcalendar = targlist[3].strip()
            else:
                tcalendar = None
            overrideTimeLinear = [ttzero, tdelta, tunits, tcalendar]
        elif flag == '--var-locate':
            if varLocate is None:
                varLocate = {}
            vname, pattern = arg.split(',')
            varLocate[vname] = pattern
        elif flag == '-x':
            writeToStdout = 0
            xmlpath = arg

    # If overriding time, process time as vector so that no gaps result
    if overrideTimeLinear is not None:
        timeIsVector = 1
        timeIsLinear = None
        if overrideCalendar == 1:
            overrideTimeLinear[3] = calenkey

    if verbose:
        print('Finding common directory ...')
    if readFromFile:
        f = open(filelistpath)
        lastargs = f.readlines()
        f.close()

    # Ignore blank paths
    realargs = []
    for arg in lastargs:
        sarg = arg.strip()
        if len(sarg) > 0:
            realargs.append(sarg)
    lastargs = realargs

    # Split lastargs into files and datasets
    fileargs = []
    dsetargs = []
    for arg in lastargs:
        base, suffix = os.path.splitext(arg)
        if suffix.lower() in ['.xml', '.cdml']:
            dsetargs.append(arg)
        else:
            fileargs.append(arg)

    # Generate a list of pathnames for datasets
    dsetfiles = []
    for path in dsetargs:
        dset = cdms2.open(path)
        if not hasattr(dset, 'cdms_filemap'):
            raise RuntimeError('Dataset must have a cdms_filemap attribute: ' + path)
        if not hasattr(dset, 'directory'):
            raise RuntimeError('Dataset must have a directory attribute: ' + path)
        dsetdirec = dset.directory
        initfilemap = cdms2.dataset.parseFileMap(dset.cdms_filemap)
        for namelist, slicelist in initfilemap:
            for t0, t1, lev0, lev1, path in slicelist:
                dsetfiles.append(os.path.join(dsetdirec, path))
    augmentedArgs = fileargs + dsetfiles

    # Find the common directory
    directory = os.path.commonprefix(augmentedArgs)
    firstpath = augmentedArgs[0][len(directory):]
    if not os.path.isfile(os.path.join(directory, firstpath)):
        dnew = os.path.dirname(directory)
        if len(dnew) > 0 and directory[len(dnew)] == '/':
            directory = dnew + '/'
        else:
            directory = dnew
    if verbose:
        print('Common directory:', directory)

    dirlen = len(directory)

    if templatestr is not None:
        if os.path.isabs(templatestr):
            templatestr = templatestr[dirlen:]
        templatere, ignore = cdms2.cdmsobj.templateToRegex(templatestr)
        template = re.compile(templatere + '$')
    else:
        template = None

    axisdict = {}
    vardict = {}
    filemap = {}
    timedict = {}
    levdict = {}
    fcdict = {}
    global_attrs = None
    fctau0 = None

    if modelMapFile is not None:
        mfile = open(modelMapFile)
        modelMap = {}
        modelDirs = []
        for line in mfile.readlines():
            mdirec, model = line.split()
            modelMap[mdirec] = model
            modelDirs.append(mdirec)
        mfile.close()

    if aliasMapFile is not None:
        afile = open(aliasMapFile)
        aliasMap = {}
        for line in afile.readlines():
            if line[0] not in ["'", '"']:  # "
                varid, alias = line.split()
            else:
                dummy, varid, alias = line.split(line[0])
                alias = alias.strip()
            aliasMap[varid] = alias
        afile.close()

    # Save extra attribute information for new axes
    for evar, eattr, evalue in extraAttrs:
        if evar == '':
            continue
        if evar in extraDict:
            curval = extraDict[evar]
            curval.append((eattr, evalue))
        else:
            extraDict[evar] = [(eattr, evalue)]

    # ------------------------------------------------------------------------
    # Initialize dictionaries if adding to an existing dataset
    if verbose and len(dsetargs) > 0:
        print('Scanning datasets ...')
    for extendPath in dsetargs:
        if verbose:
            print(extendPath)
        extendDset = cdms2.open(extendPath)

        # Add/modify attributes
        addAttrs(extendDset, extraAttrs)

        # Copy the global attribute dictionary if necessary. Note that copy.copy
        # can't be used here, since .attributes is now a 'fake' dictionary.
        if global_attrs is None:
            global_attrs = copyDict(extendDset.attributes)

        # Initialize filemap : varid => (tc0, tc1, lc0, lc1, path, timeid, levid)
        #   where tc0 is the first time index relative to the reference time, tc1 the last,
        #   lc0 is the first level, lc1 the last, path is the filename, timeid is the id
        #   of the time dimension of the variable, levid is the id of the level dimension
        #
        # timedict : (path, timeid) => (timearray, timeunits, calendar)
        #
        # levdict : (path, levelid) => (levelarray, levelunits, None)
        #
        initialize_filemap(filemap, timedict, levdict, timeid, extendDset,
                           splitOnTime, referenceTime, timeIsLinear,
                           referenceDelta, splitOnLevel, dirlen, overrideCalendar)

        # axisdict : id => transient_axis
        #   for non-partitioned axes only
        #
        tempmap = {}
        for axis in list(extendDset.axes.values()):
            if not ((splitOnTime and (axis.isTime() or axis.id == timeid)) or
                    (splitOnLevel and (axis.isLevel() or axis.id == levelid))):
                axis = cloneWithLatCheck(axis)
                if axis.id in axisdict:
                    currentaxis = axisdict[axis.id]

                    # Check that the axis has the same length and values as the saved value. If not,
                    # create an unambiguous name in the axis dictionary.
                    if compareaxes(axis, currentaxis):
                        sepname = disambig(axis.id, axisdict, len(axis), compareaxes, axis)
                        axis.name_in_file = axis.id
                        oldid = axis.id
                        axis.id = sepname
                        axisdict[sepname] = axis
                        tempmap[oldid] = sepname
                else:
                    axisdict[axis.id] = axis

        # vardict : varid => [domain, attributeDict, typecode]
        #   where domain = [axis_or_id, axis_or_id,...]
        #   and axis_or_id is the id of a partitioned dimension, or
        #   the transient axis object associated with a non-partitioned dimension
        #
        for var in list(extendDset.variables.values()):
            tempdomain = []
            for id in var.getAxisIds():
                if id in tempmap:
                    id = tempmap[id]
                if id in axisdict:
                    tempdomain.append(axisdict[id])
                else:
                    axis = extendDset[id]
                    if hasattr(axis, 'name_in_file'):
                        id = axis.name_in_file
                    tempdomain.append(id)
            varattrs = copyDict(var.attributes)
            vardict[var.id] = [tempdomain, varattrs, var.typecode()]

        extendDset.close()

        # end of loop "for extendPath in dsetargs"

    # ------------------------------------------------------------------------
    if verbose:
        print('Scanning files ...')

    boundsmap = {}  # boundsmap : varid => timebounds_id
    boundsdict = {}  # Same as vardict for time bounds
    for path in fileargs:
        path = path.strip()

        # Check if the path is included
        if includePattern is not None:
            base = os.path.basename(path)
            mobj = re.match(includePattern, base)
            if mobj is None:
                continue

        # Check if the path is excluded
        if excludePattern is not None:
            base = os.path.basename(path)
            mobj = re.match(excludePattern, base)
            if mobj is not None:
                continue

        if verbose:
            print(path)
        try:
            f = cdms2.open(path)
        except BaseException:
            if not ignoreOpenError:
                raise RuntimeError('Error opening file ' + path)
            else:
                print('Warning: cannot open file, skipping: %s' % path, file=sys.stderr)
                continue

        # Add/modify attributes
        addAttrs(f, extraAttrs)

        # Determine the variable ID suffix, if any
        varsuffix = None
        if modelMapFile is not None:
            for direc in modelDirs:
                mo = re.match(direc, path)
                if mo is not None:
                    suffixPattern = modelMap[direc]

                    def gensuffix(m, mo=mo):
                        i = int(m.group(1))
                        return mo.group(i)
                    varsuffix = re.sub(r'\\g<(\d)>', gensuffix, suffixPattern)
                    break

        # Copy the global attribute dictionary if necessary. Note that copy.copy
        # can't be used here, since .attributes is now a 'fake' dictionary.
        if global_attrs is None:
            global_attrs = copyDict(f.attributes)

        basepath = path[dirlen:]
        if template is not None and template.match(basepath) is None:
            if verbose:
                print('Warning: path %s does not match template %s' % (
                    basepath, templatestr), file=sys.stderr)

        # Find time boundary variables
        boundsids = []
        if splitOnTime:
            tmpdict = {}
            for axisname in f.axes.keys():
                axis = f[axisname]
                # was if axis.isTime() and hasattr(axis, 'bounds'):
                if axis.isTime() and (axis.getBounds() is not None):
                    tmpdict[axis.bounds] = 1
            boundsids = list(tmpdict.keys())

        # For forecasts, get the time at which the forecast begins (tau=0) which
        # is nbdate,nbsec
        if forecast:
            # f('nbdate') is numpy.int32 which gets truncated
            nbdate = numpy.int(f('nbdate'))
            nbsec = f('nbsec')  # hopefully nbsec<(seconds per day)=86400<100000
            fctau0 = nbdate * 100000 + nbsec
            fctau0time = cdtime.abstime(nbdate, "day as %Y%m%d")
            fctau0time = fctau0time.add(nbsec, cdtime.Seconds)  # fctau0 as type comptime
            fc_time_attrs = []

        varnames = list(f.variables.keys())
        # Try to force all axes to be included, but only small ones, length<100.
        # This section was motivated by a need to preserve the cloud axes isccp_prs,isccp_tau.
        # If we ever need to preserve longer axes as well, we could create one
        # variable per axis...
        crude_var_axes = [[ax[0] for ax in var.getDomain()]
                          for var in list(f.variables.values())]
        var_axes = set().union(*crude_var_axes)
        other_axes = list(set(f.axes.values()) - var_axes)
        if len(other_axes) > 0:
            other_axes = [ax for ax in other_axes if len(ax) < 100]
            other_axes.sort(key=(lambda ax: ax.id))
            axisvar = cdms2.createVariable(numpy.ones([len(ax) for ax in other_axes]),
                                           axes=other_axes, id='allaxesdummy')
            # all CdmsObj objects have this attribute, but for unknown reasons
            # datasetnode.dump() fails trying to dump this attribute's default
            # value (jfp)
            axisvar.autoApiInfo = None
            varnames.append(axisvar.id)
        # ...try to force all axes to be considered

        varnames = sorted(varnames)
        for varname in varnames:
            # If --var-locate is specified for the variable, match the basename
            # before processing
            if varLocate is not None and varname in varLocate:
                varpattern = varLocate[varname]
                base = os.path.basename(path)
                mobj = re.match(varpattern, base)
                if mobj is None:
                    continue

            # was var = f.variables[varname]
            if varname == 'allaxesdummy':
                var = axisvar
            else:
                var = f.variables[varname]

            # Reset the variable ID to any specified alias
            if aliasMapFile is not None:
                varalias = aliasMap.get(var.id)
                if varalias is not None:
                    var.name_in_file = var.id
                    var.id = varalias
                    varname = varalias

            # Append a suffix to the variable ID, if applicable
            if varsuffix is not None:
                if not hasattr(var, 'name_in_file'):
                    var.name_in_file = var.id
                var.id += varsuffix
                varname += varsuffix

            # [timestart, timeend, levstart, levend, path, timeid, levid, calendar, fctau0]
            varentry = [None] * 9
            varentry[4] = basepath
            varentry[8] = fctau0

            # Generate a temporary domain entry, and
            # create axis dictionary entries.
            domain = var.getDomain()
            if forecast:
                tempdomain = ['fctau0']
            else:
                # List of axis names and/or objects (if not partitioned)
                tempdomain = []
            for axis, start, length, truelen in domain:
                if (splitOnTime and (axis.isTime() or axis.id == timeid)) or \
                        (splitOnLevel and (axis.isLevel() or axis.id == levelid)):
                    tempdomain.append(axis.id)
                elif forecast and (axis.isTime() or axis.id == timeid):
                    # time axis isn't split but needs special treatment for
                    # forecasts
                    tempdomain.append(axis.id)
                    fc_time_attrs.append(axis.attributes)
                else:
                    axis = cloneWithLatCheck(axis)  # Transient copy
                    if axis.id in axisdict:
                        currentaxis = axisdict[axis.id]

                        # Check that the axis has the same length and values as the saved value. If not,
                        # create an unambiguous name in the axis dictionary.
                        if compareaxes(axis, currentaxis):
                            sepname = disambig(axis.id, axisdict, len(axis), compareaxes, axis)
                            axis.name_in_file = axis.id
                            axis.id = sepname

                            # Fix boundary variable names if using suffixes.
                            if varsuffix is not None and hasattr(axis, 'bounds'):
                                axis.bounds += varsuffix
                            axisdict[sepname] = axis
                        else:
                            axis = currentaxis
                    else:
                        # Fix boundary variable names if using suffixes.
                        if varsuffix is not None and hasattr(axis, 'bounds'):
                            axis.bounds += varsuffix
                        axisdict[axis.id] = axis
                    tempdomain.append(axis)

            # Create a dictionary entry for the variable if not already there.
            if var.id in boundsids:
                boundsattrs = copyDict(var.attributes)
                boundsdict[var.id] = [tempdomain, boundsattrs, var.typecode()]
                continue  # Don't set a filemap entry until axes are sorted out
            elif var.id not in vardict:
                varattrs = copyDict(var.attributes)
                if varsuffix is not None or aliasMapFile is not None:
                    varattrs['name_in_file'] = var.name_in_file
                vardict[var.id] = [tempdomain, varattrs, var.typecode()]
            else:
                currentdomain, attrs, tcode = vardict[var.id]
                if comparedomains(currentdomain, tempdomain):
                    sepname = disambig(var.id, vardict, var.size(),
                                       compareVarDictValues, (tempdomain, None))
                    saveid = var.id
                    varname = var.id = sepname
                    varattrs = copyDict(var.attributes)
                    var.name_in_file = varattrs['name_in_file'] = saveid
                    vardict[sepname] = [tempdomain, varattrs, var.typecode()]

            # Create a filemap entry for this variable/file, if split on time
            # or forecast
            axisids = [x[0].id for x in var.getDomain()]
            if splitOnTime or forecast:
                vartime = None
                if timeid is not None:
                    if timeid in axisids:
                        vartime = f.axes.get(timeid)
                    else:
                        if verbose:
                            print('Warning, time axis %s not found, -t option ignored' % timeid,
                                  file=sys.stderr)
                if vartime is None:
                    vartime = var.getTime()
                if vartime is not None:
                    if not overrideCalendar:
                        calendar = vartime.getCalendar()
                    if referenceTime is None:
                        referenceTime = vartime.units
                        if verbose and not forecast:
                            print('Setting reference time units to', referenceTime)
                    if timeIsLinear is None and timeIsVector is None:
                        timeIsLinear = (referenceTime.split()[0].lower() in
                                        ['hour', 'hours', 'minute', 'minutes',
                                         'second', 'seconds'])
                        if timeIsLinear and verbose:
                            print('Setting time representation to "linear"')  # '
                    if timeIsLinear and referenceDelta is None:
                        if len(vartime) > 1:
                            time1 = timeindex(vartime[1], vartime.units,
                                              referenceTime, None, calendar)
                            time0 = timeindex(vartime[0], vartime.units,
                                              referenceTime, None, calendar)
                            referenceDelta = time1 - time0
                        else:
                            referenceDelta = 1
                        if verbose:
                            print('Setting time delta to', referenceDelta)

                    # starttime = vartime[0]
                    # endtime = vartime[-1]
                    startindex = timeindex(vartime[0], vartime.units,
                                           referenceTime, referenceDelta, calendar)
                    endindex = timeindex(vartime[-1], vartime.units,
                                         referenceTime, referenceDelta, calendar)
                    if forecast:
                        # split on forecast, hence no split on time
                        varentry[0] = None
                        varentry[1] = None
                        referenceTime = None
                    else:
                        varentry[0] = startindex
                        varentry[1] = endindex
                    varentry[5] = vartime.id
                    varentry[7] = calendar

                    if (basepath, vartime.id) not in timedict:
                        values = vartime[:]
                        timedict[(basepath, vartime.id)] = (values, vartime.units, calendar)

            if splitOnLevel:
                varlev = None
                if (levelid is not None) and (levelid in axisids):
                    varlev = f.axes.get(levelid)
                if varlev is None:
                    varlev = var.getLevel()
                if varlev is not None:
                    startlev = varlev[0]
                    if isinstance(startlev, numpy.ndarray):
                        startlev = startlev[0]
                    endlev = varlev[-1]
                    if isinstance(endlev, numpy.ndarray):
                        endlev = endlev[0]
                    varentry[2] = startlev
                    varentry[3] = endlev
                    varentry[6] = varlev.id

                    if (basepath, varlev.id, None) not in levdict:
                        values = varlev[:]
                        levdict[(basepath, varlev.id)] = (values, varlev.units, None)

            if forecast:
                if (basepath, 'fctau0') not in fcdict:
                    fcdict[(basepath, 'fctau0')] = ([fctau0], "", None)

            if varname in filemap:
                filemap[varname].append(tuple(varentry))
            else:
                filemap[varname] = [tuple(varentry)]

            # Set boundsmap : varid => timebounds_id
            # was if splitOnTime and vartime is not None and hasattr(vartime,
            # "bounds") and not boundsmap.has_key(varname):
            if splitOnTime and vartime is not None and (vartime.getBounds() is not None) and \
                    varname not in boundsmap:
                boundsmap[varname] = vartime.bounds

            # End of loop "for varname in varnames"

        f.close()

        # End of loop "for path in fileargs"

    # ------------------------------------------------------------------------

    # Generate varindex, by combining variable names with
    # identical varentry values.
    varindex = []
    # varnames = sorted(filemap.keys())
    varnames = list(OrderedDict(sorted(filemap.items())).keys())
    for varname in varnames:
        varentry = sorted(filemap[varname])

        for varindexname, varindexvalue in varindex:
            if varentry == varindexvalue:
                varindexname.append(varname)
                break
        else:
            varindex.append(([varname], varentry))

    # If a variable is not a function of one of the partitioned dimensions,
    # no indexing is necessary: just read from the first file containing it.
    for varlist, slicelist in varindex:
        slice0 = slicelist[0]
        a, b, c, d, path0, timename, levname, calen, fctau0 = slice0
        if (a, b, c, d, fctau0) == (None, None, None, None, None):
            del slicelist[1:]

    # Change times to constant units
    sameCalendars = 1  # True iff all time calendars are the same
    prevcal = None
    if forecast:
        # The data files' time axis is interpreted to be tau time, i.e. the forecast_period.
        # Find the axis, and remember it in timedict.
        for key in timedict.keys():
            values, units, calendar = timedict[key]
            if prevcal is not None and calendar != prevcal:
                sameCalendars = 0
            prevcal = calendar
            if units.find(" as ") == -1:
                time0 = cdtime.reltime(values[0], units)
            else:
                time0 = cdtime.abstime(values[0], units)
            offset = time0.torel(units, calendar).value  # normally will be 0
            values = values + offset - values[0]
            # Switch units from "normal" time such as "days since 2001-06-01"
            # to "basic" time such as "days", which makes sense for a
            # forecast_period.
            baslen = time0.units.find(' since ')
            basic_units = time0.units[0:baslen]  # e.g. 'days'
            fc_units = basic_units
            timedict[key] = (values, fc_units, calendar)
    else:  # splitOnTime is true
        for key in timedict.keys():
            values, units, calendar = timedict[key]
            if prevcal is not None and calendar != prevcal:
                sameCalendars = 0
            prevcal = calendar
            if units.find(" as ") == -1:
                time0 = cdtime.reltime(values[0], units)
            else:
                time0 = cdtime.abstime(values[0], units)
            offset = time0.torel(referenceTime, calendar).value
            values = values + offset - values[0]
            timedict[key] = (values, referenceTime, calendar)
    if sameCalendars and prevcal is not None:
        calenkey = reverseCalendarMap[calendar]

    if forecast:
        # For forecasts, make sure that the above has made all timedict values the same.
        # >>> It's conceivable that different forecasts will have different time (really, tau)
        # >>> axes. If so, at this point we'll want to merge and mask all the time values, so
        # >>> that all variables can have the same time axis.. For now, just raise an error
        # >>> if there are time axis differences at this point.
        values0, units0, calendar0 = timedict[list(timedict.keys())[0]]
        timedict_same = all([((values0 == values1).all() and
                              units0 == units1 and
                              calendar0 == calendar1)
                             for (values1, units1, calendar1) in list(timedict.values())])
        if not timedict_same:
            raise CDMSError('cdscan is confused about times for a forecast set')
        # Earlier we had saved all the time axis attributes. Keep whatever
        # they have in common.
        fc_time_attr = fc_time_attrs[0]
        # go through all time attributes (each a dictionary)
        for fcta in fc_time_attrs:
            for attrn in fc_time_attr.keys():
                if attrn not in fcta:
                    # key attrn isn't in all time attributes
                    del fc_time_attr[attrn]
                elif fcta[attrn] != fc_time_attr[attrn]:
                    # not all time attributes have the same value for attrn
                    del fc_time_attr[attrn]
        # At this point fc_time_attr is the dictionary of those time attributes which are common to
        # all time axes encountered (in the context of a forecast dataset).
        # Finally, add the appropriate standard_name to it, if we haven't already gotten one from
        # the data file. If the file has anything other than 'forecast_period', it's wrong, but
        # we'll stick with it anyway.
        if 'standard_name' not in fc_time_attr.keys():
            fc_time_attr['standard_name'] = 'forecast_period'

    # Create partitioned axes
    axes = []
    masterCoordToInd = {}  # varkey => (timeCoordToInd, levCoordToInd)
    errorOccurred = 0
    for varlist, varentry in varindex:

        # Project time, level indices
        timeproj = {}
        levproj = {}
        fctproj = {}
        for time0, time1, lev0, lev1, path, timename, levname, calendar, fctau0 in varentry:
            if timename is not None:
                timeproj[(time0, time1)] = (path, timename)
            if levname is not None:
                try:
                    levproj[(lev0, lev1)] = (path, levname)
                except BaseException:
                    print('Cannot hash level %s range (%f,%f)' % (
                        levname, lev0, lev1), file=sys.stderr)
                    print(type(lev0), file=sys.stderr)
                    raise
            if fctau0 is not None:
                fctproj[(fctau0, fctau0)] = (path, 'fctau0')

        # and combine the projected indices into axes
        timeCoordToInd = None
        timelinCoordToInd = None
        if splitOnTime and timename is not None:
            fullaxis, name, partition, timeCoordToInd, units, opartition, timelinCoordToInd, errflag = \
                combineKeys(timeproj, timedict, timeIsLinear, referenceDelta)
            axes.append((varlist, fullaxis, name, partition, timeCoordToInd,
                         units, opartition, timelinCoordToInd, calendar))
            if errflag:
                errorOccurred = 1
        levCoordToInd = None
        if splitOnLevel and levname is not None:
            fullaxis, name, partition, levCoordToInd, units, opartition, levlinCoordToInd, errflag = \
                combineKeys(levproj, levdict)
            axes.append((varlist, fullaxis, name, partition, levCoordToInd,
                         units, opartition, levlinCoordToInd, None))
            if errflag:
                errorOccurred = 1
        fcCoordToInd = None
        if forecast:
            fullaxis, name, partition, fcCoordToInd, units, opartition, fclinCoordToInd, errflag = \
                combineKeys(fctproj, fcdict, forecast=forecast)
            axes.append((varlist, fullaxis, name, partition, fcCoordToInd,
                         units, opartition, fclinCoordToInd, None))
            if errflag:
                errorOccurred = 1
            if len(timeproj) > 0:  # i.e., if time is in this variable's domain.
                # The useKeys call is like combineKeys, except that it's for a variable not partitioned
                # among files. It just sets up axis data and (in the context of this variable loop)
                # propagates what's in timedict to every variable with time in
                # its domain.
                fullaxis, name, partition, timeCoordToInd, units, opartition, timelinCoordToInd, errflag = \
                    useKeys(timeproj, timedict, timeIsLinear, referenceDelta)
                axes.append((varlist, fullaxis, name, partition, timeCoordToInd,
                             units, opartition, timelinCoordToInd, calendar))
                if errflag:
                    errorOccurred = 1

        masterCoordToInd[varlist[0]] = (
            timeCoordToInd, levCoordToInd, timelinCoordToInd, fcCoordToInd)

    if errorOccurred:
        raise RuntimeError('Error(s) determining axis values - see previous message(s)')

    # Eliminate duplicate axes
    axes2 = []
    for vlist1, axis1, name1, partition1, coordToInd1, units1, opartition1, linCoordToInd1, calen1 in axes:
        for vlist2, axis2, name2, partition2, coordToInd2, units2, opartition2, linCoordToInd2, calen2 in axes2:
            if len(axis1) == len(axis2) and name1 == name2 and partition1 == partition2 and \
                    units1 == units2 and numpy.ma.allclose(axis1, axis2) == 1 and calen1 == calen2:
                vlist2.extend(vlist1)
                break
        else:
            axes2.append((copy.copy(vlist1), axis1, name1, partition1, coordToInd1,
                          units1, opartition1, linCoordToInd1, calen1))

    # For each axis described by axes2, disambiguate its name, create the axis
    # object, etc.
    assignedBounds = {}
    for vlist, axis, name, partition, coordToInd, units, opartition, linCoordToInd, calendar in axes2:
        # print vlist, coordToInd
        uniqname = disambig(name, axisdict, len(axis), compareaxes, axis)
        axisobj = cdms2.createAxis(axis)
        axisobj.name_in_file = name
        axisobj.id = uniqname
        axisobj.units = units
        if forecast and axisobj.isTime():
            # For forecasts, give the time axis some saved attributes.
            for attr in fc_time_attr.keys():
                if not hasattr(axisobj, attr):
                    setattr(axisobj, attr, fc_time_attr[attr])
        if timeIsLinear and axisobj.isTime():
            axisobj.partition = numpy.ma.ravel(numpy.ma.array(opartition))
            axisobj.length = axisobj.partition[-1] - axisobj.partition[0]
            mopartition = numpy.array(opartition)
            partition_length = numpy.sum(mopartition[:, 1] - mopartition[:, 0])
            if partition_length < axisobj.length:
                axisobj.partition_length = partition_length
        elif partition is not None:
            axisobj.partition = numpy.ma.ravel(numpy.ma.array(partition))
        if axisobj.isTime():
            axisobj.calendar = reverseCalendarMap[calendar]
        # axisobj.reference_partition = str(numpy.ma.ravel(numpy.ma.array(opartition)))
        if uniqname not in axisdict:
            axisdict[uniqname] = axisobj
        for varname in vlist:
            domain, attributes, tcode = vardict[varname]
            for i in range(len(domain)):
                item = domain[i]
                if isinstance(item, string_types) and item == name:
                    domain[i] = axisobj

        # Add bounds variables to vardict, varindex
        if axisobj.isTime():
            # 'Representative' variable having this time axis
            reprVar = vlist[0]
            if reprVar in boundsmap:
                boundsname = boundsmap[reprVar]
                boundsinfo = boundsdict[boundsname]
                boundsattrs = boundsinfo[1]
                if uniqname != name:
                    boundsattrs['name_in_file'] = boundsname
                    boundsname = uniqname + '_bnds'
                if boundsname not in assignedBounds:
                    axisobj.bounds = boundsname
                    for varids, ranges in varindex:
                        if reprVar in varids:
                            varids.append(boundsname)
                    tmpdom = boundsinfo[0]
                    if isinstance(tmpdom[1], string_types):
                        bndsobj = tmpdom[0]
                        boundsdomain = (bndsobj, axisobj)
                    else:
                        bndsobj = tmpdom[1]
                        boundsdomain = (axisobj, bndsobj)
                    vardict[boundsname] = (boundsdomain, boundsinfo[1], boundsinfo[2])
                    assignedBounds[boundsname] = 1

    # Collapse like indices in filemap. For example, transform
    # [x,[[0,10,-,-,file1], [0,10,-,-,file2]]] into
    # [x,[[0,10,-,-,file1]]]
    # This occurs for variables such as time boundaries, which are
    # often duplicated in different files.
    cdms_filemap_list = []
    duplicatevars = {}
    for varindexname, varindexvalue in varindex:
        timeCoordToInd, levCoordToInd, linCoordToInd, fcCoordToInd = masterCoordToInd[
            varindexname[0]]
        newslicedict = {}
        for time0, time1, lev0, lev1, path, timename, levname, calendar, fctau0 in varindexvalue:
            if timeCoordToInd is not None:
                if timeIsLinear:
                    i0, i1 = linCoordToInd[(time0, time1)]
                else:
                    i0, i1 = timeCoordToInd[(time0, time1)]
            else:
                i0 = i1 = None
            if levCoordToInd is not None:
                j0, j1 = levCoordToInd[(lev0, lev1)]
            else:
                j0 = j1 = None
            if (i0, i1, j0, j1, fctau0) in newslicedict:
                currentpath = newslicedict[(i0, i1, j0, j1, fctau0)]
                if tuple(varindexname) not in duplicatevars:
                    duplicatevars[tuple(varindexname)] = (currentpath, path)
            else:
                newslicedict[(i0, i1, j0, j1, fctau0)] = path
        # keys = sorted(newslicedict.keys())
        keys = list(OrderedDict(sorted(newslicedict.items())).keys())
        newslicelist = []
        for i0, i1, j0, j1, fctau0 in keys:
            path = newslicedict[(i0, i1, j0, j1, fctau0)]
            newslicelist.append([i0, i1, j0, j1, fctau0, path])
        cdms_filemap_list.append([varindexname, newslicelist])

    # Check if any duplicated variables are a function of longitude or latitude.
    # Raise an exception if so.
    illegalvars = []
    for varlist in duplicatevars.keys():
        for varname in varlist:
            if (excludeList is not None) and (varname in excludeList):
                continue
            dom, attrs, tcode = vardict[varname]
            for axisobj in dom:
                if axisobj.isLatitude() or axisobj.isLongitude():
                    path1, path2 = duplicatevars[varlist]
                    illegalvars.append((varname, path1, path2))
    if len(illegalvars) > 0:
        raise RuntimeError(
            "Variable '%s' is duplicated, and is a function of lat or lon: files %s, %s" %
            illegalvars[0])

    if verbose and len(list(duplicatevars.values())) > 0:
        print('Duplicate variables:', file=sys.stderr)
        for varlist in duplicatevars.keys():
            path1, path2 = duplicatevars[varlist]
            print('\t', varlist, '\t', path1, '\t', path2, file=sys.stderr)

    # Generate the cdms_filemap attribute
    cdms_filemap = str(cdms_filemap_list)
    cdms_filemap = cdms_filemap.replace(' ', '')
    cdms_filemap = cdms_filemap.replace('None', '-')
    cdms_filemap = cdms_filemap.replace('"', '')  # "
    cdms_filemap = cdms_filemap.replace("'", '')

    # Dump to XML
    datasetnode = cdmsNode.DatasetNode(datasetid)
    global_attrs['cdms_filemap'] = cdms_filemap
    global_attrs['directory'] = directory
    if sameCalendars and calenkey is not None:
        global_attrs['calendar'] = calenkey
    elif 'calendar' in global_attrs:
        del global_attrs['calendar']
    cleanupAttrs(global_attrs)
    # datasetnode.setExternalDict(global_attrs)
    setNodeDict(datasetnode, global_attrs)
    validateAttrs(datasetnode)

    timeWasOverridden = 0
    keys = list(OrderedDict(sorted(axisdict.items())).keys())
    for key in keys:
        axis = axisdict[key]
        tcode = axis.typecode()
        if tcode in [numpy.float32, numpy.float, numpy.int16,
                     numpy.int32, numpy.int, numpy.intc, numpy.int8]:
            tcode = numpy.sctype2char(tcode)
        cdtype = cdmsNode.NumericToCdType[tcode]
        node = cdmsNode.AxisNode(axis.id, len(axis), cdtype)

        # Override the time axis as a linear axis
        if axis.isTime() and (overrideTimeLinear is not None):
            ttzero = overrideTimeLinear[0]
            ttdelta = overrideTimeLinear[1]
            axis.units = overrideTimeLinear[2]
            if overrideTimeLinear[3] is None:
                axis.calendar = reverseCalendarMap[axis.getCalendar()]
            else:
                axis.calendar = overrideTimeLinear[3]
            linearnode = cdmsNode.LinearDataNode(ttzero, ttdelta, len(axis))
            node.setLinearData(linearnode)
            if verbose:
                if timeWasOverridden == 0:
                    print("Overriding values for axis '%s'" % axis.id)
                else:
                    print('Warning, overriding more than one time axis (%s)' % axis.id,
                          file=sys.stderr)
            timeWasOverridden = 1

        # Represent time as linear axis using time values in the file
        elif axis.isTime() and timeIsLinear:
            reference_length = axis.partition[-1] - axis.partition[0]
            linearnode = cdmsNode.LinearDataNode(axis[0], referenceDelta, reference_length)
            node.setLinearData(linearnode)
        else:
            try:
                node.setData(axis[:])
            except cdms2.cdmsNode.NotMonotonicError:
                if verbose:
                    print('Warning: Axis values for axis %s are not monotonic:' % axis.id,
                          axis[:], file=sys.stderr)
                    print('Warning: Resetting axis %s values to:' % axis.id,
                          numpy.arange(len(axis)), file=sys.stderr)
                node.setData(numpy.arange(len(axis)))
        axisattrs = copyDict(axis.attributes)

        # Make sure that new axes have attribute mods
        if key in extraDict:
            for eattr, evalue in extraDict[key]:
                axisattrs[eattr] = evalue
        cleanupAttrs(axisattrs)
        # node.setExternalDict(axisattrs)
        setNodeDict(node, axisattrs)
        validateAttrs(node)
        datasetnode.addId(axis.id, node)

    # keys = sorted(vardict.keys())
    # keys = vardict.keys()
    # keys = list(keys).sort()
    keys = list(OrderedDict(sorted(vardict.items())).keys())
    for key in keys:
        if (includeList is not None) and (key not in includeList):
            continue
        if (excludeList is not None) and (key in excludeList):
            continue
        domain, attrs, tcode = vardict[key]
        if tcode in [numpy.float32, numpy.float, numpy.int16,
                     numpy.int32, numpy.int, numpy.intc, numpy.int8]:
            tcode = numpy.sctype2char(tcode)
        domainNode = cdmsNode.DomainNode()
        cdtype = cdmsNode.NumericToCdType[tcode]
        node = cdmsNode.VariableNode(key, cdtype, domainNode)
        cleanupAttrs(attrs)
        # node.setExternalDict(attrs)
        setNodeDict(node, attrs)
        validateAttrs(node)
        for axis in domain:
            if hasattr(axis, 'length'):
                length = axis.length
            else:
                length = len(axis)
            try:
                elemnode = cdmsNode.DomElemNode(axis.id, 0, length)
            except AttributeError:
                print('Axis %s for variable %s does not have attribute "id"' % (
                    repr(axis), key), file=sys.stderr)
            if hasattr(axis, 'partition_length'):
                elemnode.setExternalAttr('partition_length', axis.partition_length)
            domainNode.add(elemnode)
        datasetnode.addId(key, node)

    # Add the Conventions attribute if not present
    conventions = datasetnode.getExternalAttr('Conventions')
    if conventions is None:
        datasetnode.setExternalAttr('Conventions', '')
    if templatestr is not None:
        datasetnode.setExternalAttr('template', templatestr)

    # Add/modify history
    history = datasetnode.getExternalAttr('history')
    if history is None:
        history = ""
    stringargv = reduce(lambda x, y: x + ' ' + y, argv)
    stringtime = "\n[%s] " % timestamp()
    if len(stringargv) <= 256:
        history += stringtime + stringargv
    else:
        history += stringtime + stringargv[:256] + " ..."
    datasetnode.setExternalAttr('history', history)

    # datasetnode.validate()
    if writeToStdout:
        datasetnode.dump()
    else:
        datasetnode.dump(xmlpath)
        if verbose:
            print(xmlpath, 'written')

# ------------------------------------------------------------------------
if __name__ == '__main__':
    main(sys.argv)
    try:
        from mpi4py import MPI
        comm = MPI.Comm.Get_parent()
        comm.send('done', dest=0)
    except BaseException:
        pass