Source code for rios.calcstats

This module creates pyramid layers and calculates statistics for image
files. Much of it was originally for ERDAS Imagine files but should work 
with any other format that supports pyramid layers and statistics

# This file is part of RIOS - Raster I/O Simplification
# Copyright (C) 2012  Sam Gillingham, Neil Flood
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <>.

import os
import warnings
import numpy
from osgeo import gdal
from . import cuiprogress
from .rioserrors import ProcessCancelledError

# When calculating overviews (i.e. pyramid layers), default behaviour
# is controlled by these
dfltOverviewLvls = os.getenv('RIOS_DFLT_OVERVIEWLEVELS')
if dfltOverviewLvls is None:
    DEFAULT_OVERVIEWLEVELS = [4, 8, 16, 32, 64, 128, 256, 512]
    DEFAULT_OVERVIEWLEVELS = [int(i) for i in dfltOverviewLvls.split(',')]

[docs]def progressFunc(value, string, userdata): """ Progress callback for BuildOverviews """ percent = (userdata.curroffset + (value / userdata.nbands) * 100) userdata.progress.setProgress(percent) if value == 1.0: userdata.curroffset = userdata.curroffset + 100.0 / userdata.nbands return not userdata.progress.wasCancelled()
# make userdata object with progress and num bands
[docs]class ProgressUserData(object): pass
[docs]def addPyramid(ds, progress, minoverviewdim=DEFAULT_MINOVERVIEWDIM, levels=DEFAULT_OVERVIEWLEVELS, aggregationType=None): """ Adds Pyramid layers to the dataset. Adds levels until the raster dimension of the overview layer is < minoverviewdim, up to a maximum level controlled by the levels parameter. Uses gdal.Dataset.BuildOverviews() to do the work. """ progress.setLabelText("Computing Pyramid Layers...") progress.setProgress(0) # ensure everything is written to disc first ds.FlushCache() # first we work out how many overviews to build based on the size if ds.RasterXSize < ds.RasterYSize: mindim = ds.RasterXSize else: mindim = ds.RasterYSize nOverviews = 0 for i in levels: if (mindim // i) > minoverviewdim: nOverviews = nOverviews + 1 # Need to find out if we are thematic or continuous. tmpmeta = ds.GetRasterBand(1).GetMetadata() if aggregationType is None: if 'LAYER_TYPE' in tmpmeta: if tmpmeta['LAYER_TYPE'] == 'athematic': aggregationType = "AVERAGE" else: aggregationType = "NEAREST" else: aggregationType = DEFAULT_OVERVIEWAGGREGRATIONTYPE userdata = ProgressUserData() userdata.progress = progress userdata.nbands = ds.RasterCount userdata.curroffset = 0 ds.BuildOverviews(aggregationType, levels[:nOverviews], progressFunc, userdata) if progress.wasCancelled(): raise ProcessCancelledError() # make sure it goes to 100% progress.setProgress(100)
[docs]def findOrCreateColumn(ratObj, usage, name, dtype): """ Returns the index of an existing column matched on usage. Creates it if not already existing using the supplied name and dtype Returns a tupe with index and a boolean specifying if it is a new column or not """ ncols = ratObj.GetColumnCount() for col in range(ncols): if ratObj.GetUsageOfCol(col) == usage: return col, False # got here so can't exist ratObj.CreateColumn(name, dtype, usage) # new one will be last col return ncols, True
gdalLargeIntTypes = set([gdal.GDT_Int16, gdal.GDT_UInt16, gdal.GDT_Int32, gdal.GDT_UInt32]) # hack for GDAL 3.5 and later which suppport 64 bit ints if hasattr(gdal, 'GDT_Int64'): gdalLargeIntTypes.add(gdal.GDT_Int64) gdalLargeIntTypes.add(gdal.GDT_UInt64) gdalFloatTypes = set([gdal.GDT_Float32, gdal.GDT_Float64])
[docs]def addStatistics(ds, progress, ignore=None, approx_ok=False): """ Calculates statistics and adds them to the image Uses gdal.Band.ComputeStatistics() for mean, stddev, min and max, and gdal.Band.GetHistogram() to do histogram calculation. The median and mode are estimated using the histogram, and so for larger datatypes, they will be approximate only. For thematic layers, the histogram is calculated with as many bins as required, for athematic integer and float types, a maximum of 256 bins is used. """ progress.setLabelText("Computing Statistics...") progress.setProgress(0) percent = 0 percentstep = 100.0 / (ds.RasterCount * 2) # 2 steps for each layer # flush the cache. The ensures that any unwritten data is # written to file so we get the right stats. It also # makes sure any metdata is written on HFA. This means # the LAYER_TYPE setting will be picked up by rat.SetLinearBinning() ds.FlushCache() # The GDAL HFA driver has a bug in its SetLinearBinning function, # which was introduced as part of the RFC40 changes. Until # this is fixed and widely distributed, we should disable the use # of RFC40-style techniques for HFA files. driverName = ds.GetDriver().ShortName disableRFC40 = (driverName == 'HFA') for bandnum in range(ds.RasterCount): band = ds.GetRasterBand(bandnum + 1) # fill in the metadata tmpmeta = band.GetMetadata() if ignore is not None: # tell QGIS that the ignore value was ignored band.SetNoDataValue(ignore) tmpmeta["STATISTICS_EXCLUDEDVALUES"] = repr(ignore) # doesn't seem to do anything # get GDAL to calculate statistics - force recalculation. Trap errors usingExceptions = gdal.GetUseExceptions() gdal.UseExceptions() try: if approx_ok and "LAYER_TYPE" in tmpmeta and tmpmeta["LAYER_TYPE"] == "thematic": warnings.warn('WARNING: approx_ok specified for stats but image is thematic (this could be a bad idea)') (minval, maxval, meanval, stddevval) = band.ComputeStatistics(approx_ok) except RuntimeError as e: if str(e).endswith('Failed to compute statistics, no valid pixels found in sampling.'): minval = ignore maxval = ignore meanval = ignore stddevval = 0 else: raise e finally: if not usingExceptions: gdal.DontUseExceptions() percent = percent + percentstep progress.setProgress(percent) tmpmeta["STATISTICS_MINIMUM"] = repr(minval) tmpmeta["STATISTICS_MAXIMUM"] = repr(maxval) tmpmeta["STATISTICS_MEAN"] = repr(meanval) tmpmeta["STATISTICS_STDDEV"] = repr(stddevval) # because we did at full res - these are the default anyway if approx_ok: tmpmeta["STATISTICS_APPROXIMATE"] = "YES" else: tmpmeta["STATISTICS_SKIPFACTORX"] = "1" tmpmeta["STATISTICS_SKIPFACTORY"] = "1" # create a histogram so we can do the mode and median if band.DataType == gdal.GDT_Byte: # if byte data use 256 bins and the whole range histmin = 0 histmax = 255 histstep = 1.0 histCalcMin = -0.5 histCalcMax = 255.5 histnbins = 256 tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'direct' elif "LAYER_TYPE" in tmpmeta and tmpmeta["LAYER_TYPE"] == 'thematic': # all other thematic types a bin per value histmin = 0 histmax = int(numpy.ceil(maxval)) histstep = 1.0 histCalcMin = -0.5 histCalcMax = maxval + 0.5 histnbins = histmax + 1 tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'direct' elif band.DataType in gdalLargeIntTypes: histrange = int(numpy.ceil(maxval) - numpy.floor(minval)) + 1 (histmin, histmax) = (minval, maxval) if histrange <= 256: histnbins = histrange histstep = 1.0 tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'direct' histCalcMin = histmin - 0.5 histCalcMax = histmax + 0.5 else: histnbins = 256 tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'linear' histCalcMin = histmin histCalcMax = histmax histstep = float(histCalcMax - histCalcMin) / histnbins elif band.DataType in gdalFloatTypes: histnbins = 256 (histmin, histmax) = (minval, maxval) tmpmeta["STATISTICS_HISTOBINFUNCTION"] = 'linear' histCalcMin = minval histCalcMax = maxval if histCalcMin == histCalcMax: histCalcMax = histCalcMax + 0.5 histnbins = 1 histstep = float(histCalcMax - histCalcMin) / histnbins # Note that the complex number data types are not handled, as I am not sure # what a histogram or a median would mean for such types. userdata = ProgressUserData() userdata.progress = progress userdata.nbands = ds.RasterCount * 2 userdata.curroffset = percent # Get histogram and force GDAL to recalculate it. Note that we use include_out_of_range=True, # which is safe because we have calculated the histCalcMin/Max from the data. hist = band.GetHistogram(histCalcMin, histCalcMax, histnbins, True, approx_ok, progressFunc, userdata) # Check if GDAL's histogram code overflowed. This is not a fool-proof test, # as some overflows will not result in negative counts. histogramOverflow = (min(hist) < 0) # we may use this ratObj reference for the colours below also # may be None if format does not support RATs ratObj = band.GetDefaultRAT() if not histogramOverflow: # comes back as a list for some reason hist = numpy.array(hist) # Note that we have explicitly set histstep in each datatype case # above. In principle, this can be calculated, as it is done in the # float case, but for some of the others we need it to be exactly # equal to 1, so we set it explicitly there, to avoid rounding # error problems. # do the mode - bin with the highest count modebin = numpy.argmax(hist) modeval = modebin * histstep + histmin if band.DataType == gdal.GDT_Float32 or band.DataType == gdal.GDT_Float64: tmpmeta["STATISTICS_MODE"] = repr(modeval) else: tmpmeta["STATISTICS_MODE"] = repr(int(round(modeval))) if ratObj is not None and not disableRFC40: histIndx, histNew = findOrCreateColumn(ratObj, gdal.GFU_PixelCount, "Histogram", gdal.GFT_Real) # write the hist in a single go ratObj.SetRowCount(histnbins) ratObj.WriteArray(hist, histIndx) ratObj.SetLinearBinning(histmin, (histCalcMax - histCalcMin) / histnbins) # The HFA driver still honours the STATISTICS_HISTOBINVALUES # metadata item. If we are recalculating the histogram the old # values will be copied across with the metadata so clobber it if "STATISTICS_HISTOBINVALUES" in tmpmeta: del tmpmeta["STATISTICS_HISTOBINVALUES"] else: # Use GDAL's original metadata interface, for drivers which # don't support the more modern approach tmpmeta["STATISTICS_HISTOBINVALUES"] = '|'.join(map(repr, hist)) + '|' tmpmeta["STATISTICS_HISTOMIN"] = repr(histmin) tmpmeta["STATISTICS_HISTOMAX"] = repr(histmax) tmpmeta["STATISTICS_HISTONUMBINS"] = repr(histnbins) # estimate the median - bin with the middle number middlenum = hist.sum() / 2 gtmiddle = hist.cumsum() >= middlenum medianbin = gtmiddle.nonzero()[0][0] medianval = medianbin * histstep + histmin if band.DataType == gdal.GDT_Float32 or band.DataType == gdal.GDT_Float64: tmpmeta["STATISTICS_MEDIAN"] = repr(medianval) else: tmpmeta["STATISTICS_MEDIAN"] = repr(int(round(medianval))) # set the data band.SetMetadata(tmpmeta) if ratObj is not None and not ratObj.ChangesAreWrittenToFile(): # For drivers that require the in memory thing band.SetDefaultRAT(ratObj) percent = percent + percentstep progress.setProgress(percent) if progress.wasCancelled(): raise ProcessCancelledError() progress.setProgress(100)
[docs]def calcStats(ds, progress=None, ignore=None, minoverviewdim=DEFAULT_MINOVERVIEWDIM, levels=DEFAULT_OVERVIEWLEVELS, aggregationType=None, approx_ok=False): """ Does both the stats and pyramid layers. Calls addPyramid() and addStatistics() functions. See their docstrings for details. """ if progress is None: progress = cuiprogress.SilentProgress() if ignore is not None: setNullValue(ds, ignore) addPyramid(ds, progress, minoverviewdim=minoverviewdim, levels=levels, aggregationType=aggregationType) addStatistics(ds, progress, ignore, approx_ok=approx_ok)
[docs]def setNullValue(ds, nullValue): """ Set the given null value on all bands of the given Dataset """ for i in range(ds.RasterCount): band = ds.GetRasterBand(i + 1) band.SetNoDataValue(nullValue)