/************************************************************
stat2Bin.C is a C++ program which counts a list of input
values that have been sampled along one coordinate (by any other
program) into bins of equal size. The input is an ASCII file
with two types of lines
i) lines starting with a hash (#) which are ignored
ii) lines starting with any white space followed by a floating
point number (like 1.23e5 or 3332 or -12e-5) optionally
followed by more white space and other bytes which are ignored.
The length of the lines in this file is limited to 160 bytes
but can be changed by editing the preprocessor variable LL
below prior to compilation. These first numbers in the
non-comment lines are the sample that is to be binned.
The standard output contains some comments (i.e., lines that
start with a hash so they are ignored by parsers like gnuplot)
which contain the mean and standard deviation over the input
samples (computed independent of the binning parameters). The
output contains also lines with three columns which contain
from the left to the right
i) the center of the bin in the units of the input data.
If the preprocessor variable STAT2BIN_ONE_OUT_PER_BIN is
commented before compilation, the data are duplicated in the
sense that two lines per bin will be produced with the
first representing the (x,y) coordinate on the left upper
corner of the bin graph, the 2nd representing the (x,y)
coordinate of the right upper corner of the graph.
ii) the positive integer number of samples falling into that bin
iii) the cumulative sum over the data falling into this
or any prior bin. This is useful to quickly get
percentiles if the last number in the last row is subdivided
and the result backwards interpolated from the last/third row
into the first.
The bins are listed continously, which means bins without
a hit in the input data are explicitly listed with a zero in the
second column.
Complaints on use etc are forwarded to stderr.
Compilation:
make stat2Bin
Some compilers may eject a warning about double-to-integer
conversion. The coding is correct according to the standards
and has not been adapted to such behavior.
Syntax:
stat2Bin [-b binwidth] [-f from] [-t to] inputf > outputf
Command line options and arguments:
-b the width of each bin in the units of the data of the
input file. If the option is not used, a default of 1
is assumed.
-f smallest input value at the "lower" edge of the first
(leftmost) bin. If the value of this option happens to
be larger than some of the values in the input file,
the first bin piles up these too.
If the option is not used, a default of 0 is assumed.
-t highest input value at the "upper" edge of the last
(rightmost) bin. If the value of this option happens to
be smaller than some of the values in the input file,
the last bin piles up these too.
If the option is not used, a default of 100 is assumed.
inputf this is a mandatory command line argument which
specifies the UNIX file name of the input data.
Example:
cat wdsnew*.txt | wdsToSql.pl > wds.sql
sqlite3
sqlite> .read wds.sql
sqlite> .output wdstmp
sqlite> SELECT sepfst FROM wds WHERE magfst < 10;
sqlite> .output wdstmp2
sqlite> SELECT sepfst FROM wds WHERE magfst < 10 AND magsnd <10;
sqlite> .read orb6.sql
sqlite> .output wdstmp3
sqlite> SELECT a FROM orb6 WHERE magfst < '10' AND au='a' AND ( Pu='y' AND P<'4' AND P>'0.5' OR Pu='d' AND P>'182' AND P<'1460');
sqlite> SELECT 0.001*a FROM orb6 WHERE magfst < '10' AND au='m' AND ( Pu='y' AND P<'4' AND P>'0.5' OR Pu='d' AND P>'182' AND P<'1460');
sqlite> .output wdstmp4
sqlite> SELECT a FROM orb6 WHERE magfst < '10' AND au='a' AND ( Pu='y' AND P>='4' OR Pu='d' AND P>'1460');
sqlite> SELECT 0.001*a FROM orb6 WHERE magfst < '10' AND au='m' AND ( Pu='y' AND P>='4' OR Pu='d' AND P>'1460');
sqlite> .exit
stat2Bin -b 2 -f 0.01 -t 120 wdstmp > wdsSepStat
samples input data from 0 to 120 as into bins of width 2.
Richard J. Mathar, http://www.strw.leidenuniv.nl/~mathar, 2005-11-07
*************************************************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
/* maximum number of bins on output; increase in the extraordinary
* case that this is not fine enough
*/
#define STAT2BIN_MAXBINNR 6000
/* generally, try to define steps such that
* gnuplot> plot "outputf" wi li
* will plot a nice histogram. The definition of the following macro assumes that
* only one point per bin is generated, as with
* gnuplot> set style data histeps
* gnuplot> plot "outputf"
*/
#define STAT2BIN_ONE_OUT_PER_BIN
using namespace std ;
/* maximum length of the input lines
*/
#define LL 160
void usage(char *argv0)
{
cerr << "usage: " << argv0 << "[-b binwidth] [-f from] [-t to] infile > outfile" << endl ;
}
/** compute quantile statistics over the values of the vector
* @param[in,out] dat the vector of the input values. On output, the
* vector is sorted along increasing values.
* @param[in] qu is a value between 0 and 1 corresponding to the
* quantile to be computed. For the median use 0.5, for the two sigma
* range (95.4 percent) use 0.977249868 or 0.022750132, for the three
* sigma range (99.7 percent) use 0.9986501 or 0.0013499.
* @return the abscissa value on the cumulative sum. On the curve of
* the cumulative sum of the values in \c dat, 100*qu percent
* of the values are smaller than the value returned.
* @since 2007-02-12
* @author Richard J. Mathar
*/
double quanti(vector &dat, const double qu)
{
/** we first sort the values with a call to an STL sort()
* such that a quick one-step interpolation between indices and values
* returns the value.
*/
#ifdef TEST
cout << "# unsorted " << endl ;
for(int i=0 ; i < dat.size() ; i++)
cout << i << " " << dat[i] << endl ;
#endif
sort(dat.begin(),dat.end()) ;
#ifdef TEST
cout << "# sorted " << endl ;
for(int i=0 ; i < dat.size() ; i++)
cout << i << " " << dat[i] << endl ;
#endif
const int N=dat.size() ;
if( qu <= 0.)
return 2.*dat[0]-dat[1] ;
else if ( qu >= 1.)
return 2.*dat[N-1]-dat[N-2] ;
else
{
/* This \c indx says that \c fndx of the values (out of a total of \c N)
* are to be left of the abscissa value (to be returned), the other right to it.
* The subtraction of 1 indicates the conversion to the 0-based C/C++ indexing.
* Example: if qu*N=4.2, the four values with C-indices 0,1,2 and 3 are to be
* smaller than the value returned.
*/
double fndx = qu*N-1 ;
int indx = fndx ;
/* The return value is obtained by linear interpolation between
* ( indx,dat[indx]) and (indx+1,dat[indx+1]).
*/
return dat[indx]+fmod(indx,1.)*(dat[indx+1]-dat[indx]) ;
}
}
/** compute quantile statistics over the values of the vector
* @param[in,out] dat the vector of the input values
* @param[out] qu on return, qu[0] contains the median (50 percent quantile),
* qu[0] contains the 15.86 percent quantile (which is the 1 sigma value
* of a normal distribution, qu[1] contains the 84.13 percent quantile (which
* is the 1 sigma value of a normal distribution.
* @since 2007-02-12
* @author Richard J. Mathar
*/
void quanti(vector &dat, double qu[3])
{
qu[0]=quanti(dat,0.5) ;
qu[1]=quanti(dat,0.158655254) ;
qu[2]=quanti(dat,0.841344746) ;
}
/** Main executable of stat2Bin.
* @param[in] argc the number of UNIX command line arguments, including
* the name of the executable. Correct use means this parameter is between
* 2 (only the infile is given) up to 8 (all three options are used).
* @param[in] argv pointers to the starts of the command line arguments.
* @return 0 if successful, 1 if not.
* The return value of 1 indicates either
* - the mandatory command line argument with the file name is missing
* - the file name does not refer to a readable file
* @since 2005-11-07
* @author Richard J. Mathar
*/
int main(int argc, char *argv[])
{
float valBw = 1. ; // default is binning in slices of 1. "data units"
/* sampling range. vrange[0] the minimum value, default zero,
* and vrange[1] the maximum value, default 100.
*/
double vrange[2] ={0., 100.} ;
if( argc < 2)
{
usage(argv[0]) ;
return 1 ;
}
/* The flag of any command line option */
char oc ;
while ( (oc=getopt(argc,argv,"b:f:t:")) != -1 )
{
switch(oc)
{
case 'f' :
vrange[0] = atof(optarg) ;
break ;
case 'b' :
valBw = atof(optarg) ;
break ;
case 't' :
vrange[1] = atof(optarg) ;
break ;
case '?' :
cerr << "Invalid command line option " << optopt << endl ;
break ;
}
}
ifstream f;
f.open(argv[optind]) ;
if ( ! f)
{
cerr << "Coudn't open " << argv[optind] << endl ;
return 1 ;
}
/* \c counts is the array of bin occupancies, initially zero
* (the initialization is not needed with C++ compilers that comply...)
*/
int counts[STAT2BIN_MAXBINNR] ;
for(int i=0 ; i < STAT2BIN_MAXBINNR; i++)
counts[i] =0 ;
/* countsm[0] keeps track of the minimum bin index that has
* at least occupancy 1. countsm[1] keeps track of the maximum index that
* has at least occupancy 1. The total number of input values is kept
* in countsm[2].
*/
int countsm[3] ;
countsm[0]= STAT2BIN_MAXBINNR ; countsm[1]=countsm[2]=0 ;
/* for statistical analysis: sum of the values and sum of the values squared
* No values have been read yet and these start at zero.
*/
double moms[2] = {0., 0.} ;
/** For some kind of higher statistics, we accumulate the entire input
* vector in \c dat
*/
vector dat ;
for(;;)
{
char buf[LL+1] ;
/* read the next line from the input file
*/
f.getline(buf,LL) ;
/* skip it if it starts with a hash
*/
if ( strncmp(buf,"#",1) && strlen(buf)>0 ) // a line that does not start with the hash
{
#ifdef TEST
cerr << "reading " << buf << endl ;
#endif
/* read the first value into \c val
*/
double val ;
sscanf(buf,"%le", &val) ;
dat.push_back(val) ;
/* compute a bin number for this value by linear interpolation
* between the leftmost and rightmost edges of the first and last
* bin. If the value falls outside these ranges, put it into either
* the first or the last bin.
*/
int binnr = (val-vrange[0])/valBw ;
if ( binnr >= STAT2BIN_MAXBINNR)
binnr = STAT2BIN_MAXBINNR-1 ;
if ( binnr < vrange[0])
binnr = STAT2BIN_MAXBINNR-2 ;
counts[binnr]++ ;
countsm[2]++ ;
/* update the sum over all input values and the sum over these squared
*/
moms[0] += val ;
moms[1] += val*val ;
#ifdef TEST
cerr << "plus " << binnr << " " << vrange[0] << " " << vrange[1] << endl ;
#endif
countsm[1] = max(countsm[1],binnr) ;
countsm[0] = min(countsm[0],binnr) ;
}
if ( f.eof() )
break ;
}
f.close() ;
double qu[3] ;
time_t now = time(0) ;
cout << "# generated with " << argv[0] << " from " << argv[optind] << " " << ctime(&now) ;
cout << "# " << countsm[2] << " counts between bins " << countsm[0] << " and " << countsm[1] << endl ;
#ifdef TEST
cerr << "sum " << moms[0] << " " << moms[1] << endl ;
#endif
/* mean is sum divided by the number of values.
* Std Dev is sqrt[sum(v-vbar)^2/(n-1)] = sqrt[(sum v^2 -2vbar*sum v+vbar^2*n)/(n-1)].
* = sqrt[(sum v^2 -2vbar*n*var+vbar^2*n)/(n-1)] = sqrt[(sum v^2 -vbar^2*n)/(n-1)]
*/
moms[0] = moms[0]/countsm[2] ;
moms[1] = sqrt((moms[1]-countsm[2]*moms[0]*moms[0])/(countsm[2]-1)) ;
cout << "# mean " << moms[0] << " std dev " << moms[1] << endl ;
quanti(dat,qu) ;
cout << "# median " << qu[0] << " + " << qu[2]-qu[0] << " - " << qu[0]-qu[1] << endl ;
// output of the binning results
int sumCounts=0 ;
for(int i=countsm[0]; i <= countsm[1] ; i++)
{
#ifdef STAT2BIN_ONE_OUT_PER_BIN
sumCounts += counts[i] ;
cout << vrange[0]+(i+0.5)*valBw << " " << counts[i] << " " << sumCounts << endl ;
#else
cout << vrange[0]+i*valBw << " " << counts[i] << " " << sumCounts << endl ;
sumCounts += counts[i] ;
cout << vrange[0]+(i+1)*valBw << " " << counts[i] << " " << sumCounts << endl ;
#endif
}
return 0 ;
}
#undef STAT2BIN_ONE_OUT_PER_BIN
#undef STAT2BIN_MAXBINNR