/** \mainpage @defgroup cat2Sql ASCII table to FITS/SQL converter @since 2006-05-24 @author Richard J. Mathar \htmlonly R J Mathar home page \endhtmlonly \section NAME \c cat2Sql converts an ASCII catalog file into a SQL feed sequence file or a \htmlonly FITS \endhtmlonly \latexonly \href{http://heasarc.gsfc.nasa.gov/docs/heasarc/fits.html}{FITS} \endlatexonly file with a binary table. \section ENVIRONMENT No environment variables are used. \section INSTALLATION Under unix, a command @code unix> make cat2Sql @endcode is sufficient to compile the C++ source code into the executable. It may make sense to be more specific with the optimization and use s.th. like @code unix> c++ -O -o cat2Sql cat2Sql.cxx @endcode If the optional output into FITS binary tables is needed, the \htmlonly cfitsio \endhtmlonly \latexonly \href{http://heasarc.gsfc.nasa.gov/docs/software/fitsio/fitsio.html}{cfitsio} \endlatexonly and \htmlonly CCfits \endhtmlonly \latexonly \href{http://heasarc.gsfc.nasa.gov/docs/software/fitsio/ccfits/}{CCfits} \endlatexonly packages need to be available, and the header files and libraries be specified to the compiler and linker: @code unix> c++ -O -DHAVE_CCFITS -o cat2Sql -I cat2Sql.cxx -L -lcfitsio -lCCFits @endcode \section EXAMPLES -# The first lines of the configuration file for the \htmlonly WDS \endhtmlonly \latexonly \href{http://ad.usno.navy.mil/wds/}{WDS} \endlatexonly catalog could look as follows: @code A10 den bytes 1 to 10 A7 disc bytes 11 to 17 A5 comp I4 datefst first year I4 datelst last year 4X nobs skip the number of observations I3 posfst first position # teaser I3 poslst last position columns 43-45 F5.1 sepfst [mas] separation first @endcode If this is in a file wds.cfg, the SQL commands are created with @code cat wdsnew*.txt | cat2Sql -c wds.cfg > wds.sql @endcode supposed that the ASCII files had been in wdsnew*.txt . -# the first lines of the "alternative" configuration file of the \htmlonly CHARM2 \endhtmlonly \latexonly \href{ftp://cdsarc.u-strasbg.fr/pub/cats/J/A+A/431/773/}{CHARM2} \endlatexonly catalog could look as follows @code 1-4 I N [] sequential order in the table 6-22 A Source [] Name of the source 24 A VLTI [] Presence of VLTI observations 26-29 A Type [] Type of source 31-54 A Id1 [] cross-indentification 55-78 A Id2 [] cross-indentification 80-103 A Id3 [] cross-indentification 105-129 A Id4 [] cross-indentification 131-135 A Method [] Method of observation 137-142 A Lambda [] Wavelength of observation 144-149 F UD [mas] uniform disk angular diameter 151-155 F e_UD [mas] mean error on UD 157-161 F5.2 ULUD [mas] Upper limit for Uniform disk diameter 163-168 F6.2 LD [mas] Limb-darkened angular diameter 170-173 F4.2 e_LD [mas] Mean error on LD 175 A1 l_Shell Limit flag on Shell 176-181 F6.1 Shell [mas] Circumstellar shell 182-185 A4 n_Shell Note on Shell (Y, Y? or 20x3) 187-192 F6.1 Sep [mas] Binary: separation 196-200 F5.1 PA [deg] Binary: position angle 202-206 F5.1 Ratio Binary: Brightness ratio 208 I1 Ratio2 ? Upper limit of brightness ratio 209-210 A2 n_Ratio2 [2m] 212-213 A2 Bin Binary: type 215-331 A117 Com Comments extracted from the reference or inserted by the catalogue compiler 333-334 I2 RAh [h] Right Ascension J2000 336-337 I2 RAm [min] Right Ascension J2000 339-344 F6.3 RAs [s] Right Ascension J2000 346 A1 DE- Declination sign (J2000) 347-348 I2 DEd [deg] Declination (J2000) 350-351 I2 DEm [arcmin] Declination (J2000) 353-357 F5.2 DEs [arcsec] Declination (J2000) 359-366 F8.5 pmRA [as/a] Proper motion in right ascension 368-376 F9.6 pmDE [as/a] Proper motion in declination 378-382 F5.2 Bmag [mag] B magnitude 384-388 F5.2 Vmag [mag] V magnitude 390-394 F5.2 Kmag [mag] K magnitude (2.2um) 396 A1 f_Kmag *Code for Kmag 398-404 F7.2 12um [Jy] Flux at 12um 406 A1 f_12um Code for 12um flux 408-420 A13 SpType MK Spectral class 421-424 A4 Var Variability note 426-431 F6.2 Plx [mas] Parallax 433-438 F6.1 km/s RV [km/s] Radial velocity 440-448 A9 ref1 References, in refs.dat file 450-458 A9 ref2 References, in refs.dat file 460-468 A9 ref3 References, in refs.dat file @endcode If this is in the file charm2.cfg, it would be used like @code unix> cat2sql -C charm2.cfg > tmp.cfg unix> cat2sql -f -k N -c tmp.cfg < charm2.dat > charm2.fits @endcode \section NOTES The byte counts implied by the Fortran-specifiers in the configuration file are indeed byte counts. Tabulators, backspaces and other control characters should be expanded before feeding them into the program, see expand(1). If the original input data have a free-format, one may also pipe it through a corresponding awk(1) with internal print commands to generate a fixed format on the fly, which can become the standard input to this program here. Example: If the original data are a name, an integer and a floating-point number, the pipe might look like @code unix> cat origfile | awk '{print ".3s%4d%7f%10e",$1,$2,$3,$4}' | ... @endcode to cast this input into a fixed format with a total range of 3+4+7+10 bytes. The last lines of an output generated with the \c -C option show this type of command string for each particular configuration file. The individual lines of the input table must be terminated by the standard character of the C++ implementation, which is the LF on UNIX systems. If the source table does not follow this guideline, one must use other tools like cut(1), wrap(1), fold(1) etc to ensure that the input lines can be interpreted starting with the first byte in each line according to the sequence of bytes of the configuration file. @todo Handle P and G and maybe H Fortran specifiers. @todo Are there complex data formats in SQL? @todo We may need masking escape sequences for some input lines. @bug If the standard input contains special characters like parenthesis or quotation marks, the generated output will likely be confusing the SQL interpreter, because no masking of these has yet been added to this program. \section MAKEFILE \section SEE ALSO wds2Sql.pl **/ #include #include #include #include #include #include #include #include #include #include /* If the \c -DHAVE_CCFITS cpp option has been used, we include * the headers of \c CCfits and \c cfitsio, which define a number of * tags to specify column data types. If this cpp switch had not been * used, we make up our own ones. */ #ifdef HAVE_CCFITS #include using namespace CCfits ; #else #define TBIT 1 /* codes for FITS table data types */ #define TBYTE 11 #define TSBYTE 12 #define TLOGICAL 14 #define TSTRING 16 #define TUSHORT 20 #define TSHORT 21 #define TUINT 30 #define TINT 31 #define TULONG 40 #define TLONG 41 #define TINT32BIT 41 /* used when returning datatype of a column */ #define TFLOAT 42 #define TLONGLONG 81 #define TDOUBLE 82 #define TCOMPLEX 83 #define TDBLCOMPLEX 163 #define FLEN_VALUE 71 #endif using namespace std ; /** The Col class contains types and ranges (in units of bytes) in * an ASCII table of one coherent datum, that is one column in the * ASCII input file. * @since 2006-05-24 * @author Richard J. Mathar */ class Col { public: /** The byte range of this column in the input ASCII file. This is 0-based such * that \c col[0] =0 would indicate the first byte in each line. \c col[1] contains * the first byte after the column. If this is a column which is repeated, \c col[1] * refers to the first instance of the columns, such that it actually coincides with * the location of the first byte of the 2nd colum in the ASCII file. */ int col[2] ; /** Repetition number for vector type columns. Default is 1 which means * the next, adjacent column is specified by it's own instance of this class. * Resembles the \f$r\f$ specifier in the formatted input/ouput of Fortran. */ int repeat; /** the data type in the SQL sense. * This can be \c INTEGER, \c char, \c double etc. */ string sqlfmt ; /** The FITS column type in the \c cfitsio sense as defined in the \c fitsio.h * file of its C source. */ int ftype; /** Name of the column in the SQL database or the FITS binary table. * If the output is a FITS binary table, all characters will be forced into * uppercase. */ string dbcolname ; /** Comment of the column in the FITS sense. * This ought contain the physical unit of the data in the \c cfitsio convention, which * means in a first piece enbraced by \c [ .. \c ]. */ string comm ; /** \c true if the column is of numerical type , \c false for character * or name formats. */ bool numFormat ; /** Constructor. * @param[in] colstrt the byte number (0-based) meaning the first (leftmost) byte * in each of the ASCII input lines. * @param[in] f77fmt a width and type specifier in the Fortran convention. Examples * are \c 3I6 for 3 columns each 6 bytes wide with integers, or \c 10A strings * of 10 bytes widths, or \c F5.4 a single precision floating point number spanning * over 5 bytes. The numeric formats (\c I, \c F, \c E) may be blank-padded on the * left. * @param[in] name the column name. * @param[in] comment a comment on the meaning of the column. This would probably * start with a unit enclosed in brackets. * @since 2006-05-24 * @author Richard J. Mathar */ Col(int colstrt, const string f77fmt, const string name, const string comment = "") : dbcolname(name), repeat(1), comm(comment), ftype(-1) { col[0]=colstrt ; int wid =0 ; int prec =0 ; int r =0 ; ostringstream auxFmt ; // any repeat factor? Ignore attempts to introduce negative ones. sscanf(f77fmt.c_str(),"%d",&r) ; if ( r>1 ) repeat=r ; string::size_type f77pos= f77fmt.find_first_of("APGXDEIF") ; switch( f77fmt[f77pos] ) { case 'X' : sscanf(f77fmt.c_str(),"%d",&wid) ; col[1] = col[0]+wid ; repeat = 1 ; // no repeat factor allowed with X numFormat = false ; break ; case 'I' : sscanf(f77fmt.c_str()+f77pos+1,"%d",&wid) ; col[1] = col[0]+wid ; sqlfmt="INTEGER"; numFormat = true ; ftype= TLONG ; break ; case 'A' : sscanf(f77fmt.c_str()+f77pos+1,"%d",&wid) ; col[1] = col[0]+wid ; auxFmt << "char(" << wid << ")" ; sqlfmt=auxFmt.str() ; numFormat = false ; ftype= TSTRING ; break ; case 'D' : case 'E' : case 'F' : sscanf(f77fmt.c_str()+f77pos+1,"%d.%d",&wid,&prec) ; col[1] = col[0]+wid ; auxFmt << "DEC(" << wid ; if ( prec > 0 ) auxFmt << "," << prec ; auxFmt << ")" ; sqlfmt=auxFmt.str() ; numFormat = true ; break ; default: cerr << "Unrecognized field specification " << f77fmt << endl ; exit(EXIT_FAILURE) ; } switch( f77fmt[f77pos] ) { case 'D' : case 'E' : ftype=TDOUBLE ; break ; case 'F' : ftype=TFLOAT ; break ; } // cerr << " cols " << col[0] << " .. " << col[1] << " prec " << prec << " " << sqlfmt << endl ; } /** Clone a name by appending a number. * We attach the number r to the base name of the column to have * different column names in case this is used for SQL table construction. * @param[in] r the sequential number of this vector column. * @since 2006-05-24 */ void cloneName(int r) { ostringstream auxFmt ; auxFmt << dbcolname << r ; dbcolname = auxFmt.str() ; } #if INCLUDE_SUPERFLUOUS int txtWidth() const { return col[1]-col[0] ; } #endif /** Chop off a number of bytes (vertically) in the ASCII string. * This acts like the UNIX cut(1) operation for a consecutive byte range, and * leaves only the bytes that belong to this column intance here. * @param[in] ascii the ASCII line * @return the substring that consists of the consecutive bytes from within this * ASCII string as specified by this instances col parameter. * @since 2006-05-24 */ string cut(const string ascii, const bool warn) const { if ( ascii.length() < col[1]) { if ( warn ) cerr << "Warning: Input line " << ascii << " too short (" << ascii.length() << " found," << col[1] << " needed)\n" ; return string() ; } else return string(ascii,col[0],col[1]-col[0]) ; } /** Construct a representation in the printf style of awk. * @return a string in the awk(1) style, like %4d, %7f or %10s * @since 2007-03-07 */ string awkPrintf() const { char f[FLEN_VALUE+1] ; f[0]='\0' ; /* as in ffgbclll() of fitscore.c */ switch(ftype) { case -1 : // sprintf(f,"%%%ds",col[1]-col[0]) ; for(int i=col[0] ; i < col[1] ; i++) strcat(f," ") ; break ; case TBYTE : sprintf(f,"%%c") ; break ; case TSTRING : sprintf(f,"%%%ds",col[1]-col[0]) ; break ; case TSHORT : case TLONG : case TLONGLONG : sprintf(f,"%%%dd",col[1]-col[0]) ; break ; case TFLOAT : sprintf(f,"%%%df",col[1]-col[0]) ; break ; case TDOUBLE : sprintf(f,"%%%de",col[1]-col[0]) ; break ; case TBIT : case TLOGICAL : case TCOMPLEX : case TDBLCOMPLEX : default: /* no warning here with this version, because the output will only * be used in a comment. */ ; } return string(f) ; } /* Convert the name to upper case in support of FITS conventions. * @return the name converted to upper case, as proposed by the FITS standard. * @since 2007-02-28 */ string up_name() const { char uname[FLEN_VALUE+1] ; uname[0]='\0' ; strncat(uname,dbcolname.c_str(),FLEN_VALUE) ; for(int c=0 ; c < strlen(uname); c++) uname[c]=toupper(uname[c]) ; return string(uname) ; } /** Extract a unit from the comment. * @return the substring between the first pair of \c [ and \c ], * and an empty string if no such specification is in the comment. * @since 2007-02-28 */ string unit() const { /* Search for the opening and the closing bracket */ string::size_type op_bra = comm.find_first_of("[") ; string::size_type clo_bra = comm.find_first_of("]") ; /* If both are present and in the correct order, extract the * string in between, otherwise return an empty string. */ if ( op_bra != string::npos && clo_bra != string::npos && clo_bra > op_bra) return comm.substr(op_bra+1,clo_bra-op_bra-1) ; else return string("") ; } /** Create a binary table column type string * @since 2007-02-28 * @return the representation like \c J or \c D or \c 10A as requested * for \c cfitsio instantiation of this particular column. * @todo figure out whether this needs to be adorned with the repitition factor * for vector columns. */ string fitst() const { char f[FLEN_VALUE+1] ; /* as in ffgbclll() of fitscore.c */ switch(ftype) { case TBIT : sprintf(f,"X") ; break ; case TBYTE : sprintf(f,"B") ; break ; case TLOGICAL : sprintf(f,"L") ; break ; case TSTRING : sprintf(f,"%dA",col[1]-col[0]) ; break ; case TSHORT : sprintf(f,"I") ; break ; case TLONG : sprintf(f,"J") ; break ; case TLONGLONG : sprintf(f,"K") ; break ; case TFLOAT : sprintf(f,"E") ; break ; case TDOUBLE : sprintf(f,"D") ; break ; case TCOMPLEX : sprintf(f,"C") ; break ; case TDBLCOMPLEX : sprintf(f,"M") ; break ; } return string(f) ; } protected: private: } ; /* Output in the style that is conforming to the configuration file Cfg . * @param[in,out] os the byte stream to which this is printed. * @param[in] c the instance of Col to be printed. * @since 2007-02-28 */ ostream & operator<< (ostream &os, const Col &c) { if ( c.repeat > 1) os << c.repeat ; switch( c.ftype) { case TSTRING: os << c.col[1]-c.col[0] << "A" ; break ; case TLONG: os << "I" << c.col[1]-c.col[0] ; break ; case TFLOAT: os << "F" << c.col[1]-c.col[0] ; break ; case TDOUBLE: os << "D" << c.col[1]-c.col[0] ; break ; case -1: os << c.col[1]-c.col[0] << "X" ; break ; } os << " " << c.dbcolname << " " << c.comm ; return os ; } /** Comparison operator. * A column is smaller than another one if its byte location is left from it * in the ASCII input. * @param[in] left column to the left of the comparison sign * @param[in] right column to the right of the comparison sign * @return \c true or \c false * @author Richard J. Mathar * @since 2007-02-28 */ bool operator< (const Col & left, const Col & right) { return ( left.col[0] < right.col[0]) ; } /** This is the name of the SQL column with the uniqe key if no other one has * been specified on the command line. */ #define SQL_UNIQ_KEY_DEFAULT "sql_key" /** This class represents a range of columns as embodied by a configuration file. * @since 2006-05-24 * @author Richard J. Mathar */ class Cfg { public: /** The non-overlapping columns present in each row (each line of * the ASCII inpute). */ vector cols ; /** The name of the FITS binary table or the SQL table. */ string tblname ; /** Constructor. * @param[in] inp the file stream with the configuration file, positionend * at the first line. * @param[in] baseFname the name of the table to be created. * @param[in] sqlkey the name of the column that may server as a unique * key in the SQL sense (such that two different rows in the table do * never have the same entry in this column). * @bug the expansion of the vector columns needs to be suppressed * if the output is of FITS type. So it must be moved to the place * where the interface is an output to SQL. * @since 2006-05-24 * @author Richard J. Mathar */ Cfg(ifstream & inp, const char *baseFname, const string sqlkey) : pkey(sqlkey) { /* Construct the SQL table name from the baseFname without the suffix. * Supposed that this is a default chosen by the program, there would * probably a \c .cfg be part of the string, that we discard. */ tblname = baseFname ; string::size_type dot= tblname.find('.') ; if ( dot != string::npos) tblname.erase(dot) ; while ( !inp.eof() ) { /* one line of the configuration file is in buf */ string buf ; getline(inp,buf) ; /* skip comment lines or lines that are empty. */ if ( buf[0] == '#') continue ; if ( buf.empty() ) continue ; /* debugging /*cout << buf << endl ; */ istringstream line(buf) ; string f77fmt, dbcolname, commen ; /* from the configuration line we read the Fortran type specifier, * the column name, and push the rest of the line (potentially including * the units) into the comment */ line >> f77fmt >> dbcolname ; /* debugging /* cout << f77fmt << " " << dbcolname << endl ; */ getline(line,commen,'\0') ; /* We create one instance of Col from this line of * the configuration file. */ Col field(bWidth(),f77fmt,dbcolname,commen) ; if ( field.repeat > 1 ) { /* If this is a vector field, we create multiple copies such * that each of the columns is represented. * @bug this is an incorrect place to do this since we would * like to create FITS vector columns without such duplication. */ for(int r=1 ; r <= field.repeat ; r++) { Col field_rep(bWidth(),f77fmt,dbcolname) ; field_rep.cloneName(r) ; cols.push_back(field_rep) ; /* debugging /* cerr << " col " << field_rep.dbcolname << " " /* << field_rep.col[0] << ".. " << field_rep.col[1] << endl ; */ } } else cols.push_back(field) ; } /* If the user didn't specify a SQL key, generate our own one * with the default name, leaving room for integers with up to 6 decimal places, * which ought be large enough for most applications.. */ if ( pkey.length() == 0 ) { pkey= SQL_UNIQ_KEY_DEFAULT ; Col field(-1,"I6",pkey) ; cols.push_back(field) ; autopkey=true ; } else autopkey=false ; } /** Construct a representation in the printf style of awk. * @return a string in the awk(1) style, like "%4d%7f%10s",$1,$2,$3 * @since 2007-03-07 * @todo the implementation duplicates Cfg::awkPrintf() which ought be * merged into an interface in the Cols class to avoid this. */ string awkPrintf() const { ostringstream fmt ; /* first construct the format specifiers, calling Col::awkPrintf() in a * loop over all columns in the table. */ fmt << "\"" ; for(int c=0 ; c < cols.size() ; c++) fmt << cols[c].awkPrintf() ; fmt << "\"" ; /* append the list of the table columns. */ int field=1 ; for(int c=0 ; c < cols.size() ; c++) if ( cols[c].ftype != -1 ) fmt << ",$" << field++ ; return fmt.str() ; } /** construct the CREATE TABLE sql command. * @since 2006-05-24 * @return the string with the \c CREATE \c TABLE SQL command, covering * the full table width.. */ string creatTbl() const { string resul="CREATE TABLE " ; resul.append(tblname) ; resul += " (" ; bool firsten = true ; /* this is essentially a loop across all columns, concatened in * the syntax required for the \c CREATE \c TABLE */ for(int i=0 ; i < cols.size() ; i++) { if( cols[i].sqlfmt.empty() ) // skip X formats.. continue ; if( !firsten) resul += ", " ; resul += cols[i].dbcolname + " " + cols[i].sqlfmt ; firsten=false ; } resul += ", PRIMARY KEY (" + pkey + ")" ; resul += " )" ; // no semicolon appended return resul ; } /** construct a 2-line string with the "created" information in SQL comment format. * @return a SQL comment that is a useful tag for the SQL command line. * @since 2006-05-24 */ string creatDoc(int argc, char *argv[]) const { string resul= "-- created " ; time_t now ; time(&now) ; resul += ctime(&now) ; resul += "-- by " ; resul += getenv("USER") ; resul += " running " ; resul += argv[0] ; resul += " (author Richard J. Mathar) with options " ; for(int i=1 ; i < argc ; i++) { resul += argv[i] ; resul += " " ; } return resul ; } /** Construct the INSERT sql command for the ascii line provided. * @param[in] ascii the ASCII line from the input * @param[in] warn a flag to indicate if warnings about incompatibility * with the format of the configuration should be emitted. * @return the \c INSERT SQL command equivalent to the parsed ascii line. * @since 2006-05-24 */ string creatInsrt(const string ascii, const bool warn) const { static int recordNo =0 ; string resul="INSERT INTO " ; resul.append(tblname) ; resul += " (" ; bool firsten = true ; for(int i=0 ; i < cols.size() ; i++) { if( cols[i].sqlfmt.empty() ) // skip X formats.. continue ; if( !firsten) resul += ", " ; resul += "\"" + cols[i].dbcolname + "\"" ; firsten=false ; } resul += ") VALUES(" ; firsten = true ; /* the natural number of columns may or may not include the last column * which we may have generated for the auto-generated primary key. */ const int natcolNo = autopkey ? ( cols.size()-1 ) : cols.size() ; // handle all but the primary key (last of the columns) for(int i=0 ; i < natcolNo ; i++) { if( cols[i].sqlfmt.empty() ) // skip X formats.. continue ; if( !firsten) resul += ", " ; if ( ! cols[i].numFormat ) resul += "' " ; resul += cols[i].cut(ascii,warn) ; if ( ! cols[i].numFormat ) resul += "' " ; firsten=false ; } if ( autopkey) { // handle the primary key resul += ", " ; ostringstream auxFmt ; auxFmt << ++recordNo ; resul += auxFmt.str() ; } resul += ")" ; return resul ; } #ifdef HAVE_CCFITS /** Insert a representation of the ascii line into the HDU. * @param[in,out] hdu the header-data unit into which one row ought be modified. * @param[in] ascii the information of one line of the ASCII input table. * @param[in] firstrow the row number of the HDU (1-based) into which the information * is to be inserted. This may be larger or smaller than the current size of * the columns, and is known to be a slow operation if this actually appends * information in a new row. * @param[in] warn a flag that indicates whether complaints on mismatches * with the configuration should be emitted. * @since 2007-02-28 */ void creatInsrt(ExtHDU & hdu, const string ascii,const long firstrow, const bool warn) const { for(int i=0 ; i < cols.size() ; i++) { if( cols[i].ftype < 0 ) // skip X formats and the one with the pkey continue ; Column & fc = hdu.column(cols[i].dbcolname) ; /* cut won't work for the pkey columns of -1 */ if ( cols[i].col[0] < 0 ) continue ; string entr = cols[i].cut(ascii,warn) ; /* debugging * cerr << "handl |" << entr << "| " << cols[i].dbcolname << " " << firstrow << endl ; */ istringstream entrStr(entr) ; /* the CCfits interface allows only strong-typed \c write commands, so * we need dummy declarations for the variety of data types that may occur */ double tDouble ; float tFloat ; long tLong ; vector tstr ; /* Currently only \c A, \c J, \c F and \c D type of columns are implemented */ switch( cols[i].ftype) { case TSTRING : tstr.push_back(entr) ; fc.write(tstr,firstrow) ; break ; case TLONG : entrStr >> tLong ; fc.write(&tLong,1L,firstrow) ; break ; case TFLOAT : entrStr >> tFloat ; fc.write(&tFloat,1L,firstrow) ; break ; case TDOUBLE : entrStr >> tDouble ; fc.write(&tDouble,1L,firstrow) ; break ; case TBIT : case TBYTE : case TLOGICAL : case TSHORT : case TLONGLONG : case TCOMPLEX : case TDBLCOMPLEX : ; // it is a bug in gcc 4.1.1 that this semi-colon must be here } } } #endif /* HAVE_CCFITS */ /** A cross check that all SQL column names are mutually different * If some of the column names are the same, we write a warning * to \c stderr. * @since 2006-05-24 */ void uniq() { for(int i=0; i < cols.size()-1 ; i++) for(int j=i+1; j < cols.size() ; j++) if( cols[i].dbcolname == cols[j].dbcolname) cerr << "Warning: columns " << i << " and " << j << " share the same name \"" << cols[i].dbcolname << "\"\n" ; } /** A check that the primary key actually exists as a column * If the primary key does not match an existing column name, we write a warning * to \c stderr on this fact. * @since 2006-05-24 */ void keyExist() { for(int i=0; i < cols.size() ; i++) if ( cols[i].dbcolname == pkey) return ; cerr << "Warning: primary key column " << pkey << " does not exist\n" ; } #ifdef HAVE_CCFITS /** Convert into a FITS binary table. * @return a pointer to the FITS representation of the generated file. * Note that the binary table is generated here, but not yet filled * with the actual data, which is done in starCat::outfits(). * @author Richard J. Mathar * @since 2007-02-28 */ operator FITS*() { /* construct the new FITS object with the option to write * it on \c stdout when the object is destroyed. */ FITS *fits = new FITS("-",Write) ; fits->pHDU().makeThisCurrent() ; /* A primitive trace of when this was created is left in the PHDU. */ fits->pHDU().addKey("CREATOR",__FILE__,"Program to create this file (R.J. Mathar)") ; fits->pHDU().writeDate() ; /* binary table column names, units and formats are assembled * to characterize the FITS table. So far no one of the formatting specs * is forwarded. */ vector colNams ; vector colUns ; vector colFmt ; for(int c=0 ; c < cols.size() ; c++) { if( cols[c].ftype != -1) { colNams.push_back( cols[c].up_name() ) ; colUns.push_back( cols[c].unit() ) ; /* nA for ASCII, else "E", "D", "E", "J", "I" */ colFmt.push_back( cols[c].fitst() ) ; } } fits->addTable(tblname,0,colNams,colFmt,colUns) ; #if 0 /* in some future update one might want to specify * how empty input is handled in terms of NULL FITS values. */ ExtHDU & e = fits->extension(tblname) ; e.column("HDN").addNullValue(-1L) ; e.column("HIC").addNullValue(-1L) ; e.column("FK6").addNullValue(short(-1)) ; e.column("SAO").addNullValue(-1L) ; e.column("INDX").addNullValue(short(-1)) ; #endif return fits ; } #endif protected: private: /** compute the total ASCII width (so far), which is the lowest (0-based) byte not yet recognized * by the input parser * @return the rightmost byte number (0-based) which is not yet covered by any * of the columns. This is the potentially safe place to append a new column. * @since 2006-05-24 */ int bWidth() const { int resul=0 ; for(int i=0; i < cols.size() ; i++) resul=max(resul,cols[i].col[1]) ; return resul ; } /** boolean flag indicating whether we created the unique primary SQL key locally or * whether it was taken from a user specification. */ bool autopkey ; /** the primary SQL key */ string pkey ; } ; #undef SQL_UNIQ_KEY_DEFAULT /** Output of a configuration represented in altCfg style in the style of the * configuration file Cfg . * This is done by looping over all the individual columns. * @param[in,out] os * @param[in] cfg * @since 2007-03-07 */ ostream & operator<< (ostream &os, const Cfg &cfg) { for(int c=0 ; c < cfg.cols.size() ; c++) os << cfg.cols[c] << endl ; return os ; } /** A class with the simpler configuration by specification of byte ranges. * The simplification relative to the Cfg class is * - there is no need to characterize each byte in the input file, which means * one may leave holes in the byte ranges which will effetively be skipped. * These would have to be counted explicitly and used with \c X in the Cfg * class. * - The order of the colum specifications is arbitrary. * - Most astronomic data tables have documentation that specifies byte * ranges starting with 1 at the start of the lines. This is equivalent * to what is the input to this class here. * @since 2007-02-28 */ class altCfg { public: /** the specifications of the columns in the configuration */ vector cols ; /** ctor. * @param[in] inp the file stream with the alternative input configuration */ altCfg(ifstream & inp) { /* We loop over all lines of the configuration file */ while ( !inp.eof() ) { string buf ; getline(inp,buf) ; /* skip comment and empty lines */ if ( buf[0] == '#') continue ; if ( buf.empty() ) continue ; istringstream line(buf) ; string bspec, f77fmt, dbcolname, commen ; /* each lines has a byte range specifier in the \c start * or \c start-end format, a Fortran specifier (where the width becomes * redundant because it is implicit in the byte range), a name of the column * for use in the SQL table or binary table, and a residual part treated * as a comment (and optionally containing physical units) */ line >> bspec >> f77fmt >> dbcolname ; /* push rest of the line into the comment */ getline(line,commen,'\0') ; int bwidth ; const int cstart = bStrt(bspec,bwidth) ; // f77fmt may contain a blank character, a character with a repetition, // and a character with a field width and/or precision string::size_type f77pos= f77fmt.find_first_of("APGXDEIF") ; /* discard invalid type specifications */ if ( f77pos == string::npos) { cerr << "# invalid type spec " << f77fmt << " in " << buf << endl ; continue ; } if ( f77fmt.at(f77pos) != 'X' ) { /* we overwrite whatever may follow the character by the field width from the first * part of the spec * @todo this ought actually honor any more detailed spec from the * input line. */ f77fmt.erase(f77pos+1) ; char bwidthstr[4]; snprintf(bwidthstr,4,"%d",bwidth) ; f77fmt.append(bwidthstr) ; /* debugging * cerr << "# " << cstart << " " << f77fmt << " " << dbcolname << " " << commen << endl ; */ /* For each line we create one instance of Col and add it to the * vector cols . */ Col field(cstart,f77fmt,dbcolname,commen) ; cols.push_back(field) ; } } } /** Sort the columns from left to right according to the byte placement * in the input files. The main action is to insert dummy \c X columns * to render this format compatible with the main configuration file, * such that coverage of an entire line is ensured. * @since 2007-02-28 */ void sortnFill() { /** place the columns in increasing order so we can detect * gaps by scanning them linearly */ sort(cols.begin(),cols.end()) ; /** fill in X specifiers if there are gaps; these are appended instead of * merged into the vector, because insertion would garble the iterators */ int currCol =0 ; /* The cols.size() will change while looping, so we save it beforehand */ const int tablc = cols.size() ; for(int c=0 ; c < tablc ; c++) { if ( cols[c].col[0] > currCol) { ostringstream f77fmt, cmt ; f77fmt << cols[c].col[0]-currCol << "X" ; cmt << "dummy" << currCol << " auto-generated by "__FILE__ ;; Col xcol(currCol, f77fmt.str() , cmt.str()) ; cols.push_back(xcol) ; } currCol = cols[c].col[0]+cols[c].repeat*(cols[c].col[1]-cols[c].col[0]) ; } /** place the columns in increasing order */ sort(cols.begin(),cols.end()) ; } /** Construct a representation in the printf style of awk. * @return a string in the awk(1) style, like "%4d%7f%10s",$1,$2,$3 * @since 2007-03-07 */ string awkPrintf() const { ostringstream fmt ; /* first construct the format specifiers, calling Col::awkPrintf() in a * loop over all columns in the table. */ fmt << "\"" ; for(int c=0 ; c < cols.size() ; c++) fmt << cols[c].awkPrintf() ; fmt << "\"" ; /* append the list of the table columns. */ int field=1 ; for(int c=0 ; c < cols.size() ; c++) if ( cols[c].ftype != -1 ) fmt << ",$" << field++ ; return fmt.str() ; } protected: private: /** decompose the \c start-\end string into the start byte and * the byte widths. * @param[in] bspec a single number (meaning one byte only) or a range * of bytes separated by a dash. * @param[out] bwidth the width of the column in units of bytes. * @return the byte number of the start (0-based) * @since 2007-02-28 */ int bStrt(const string bspec, int & bwidth) const { istringstream line(bspec) ; int resul[2] ; /* the first number denotes the first byte */ line >> resul[0] ; /* We search for the dash. If it is present we skip it * and put the second number in resul[1]. */ string::size_type dashpos= bspec.find_first_of("-") ; if ( dashpos != string::npos) { char dash ; line >> dash >> resul[1] ; } else resul[1] = resul[0] ; /* The input was 1-based. on output we convert it to 0-based indexing */ bwidth = resul[1]-resul[0]+1 ; return resul[0]-1 ; } } ; /** Output of a configuration represented in altCfg style in the style of the * configuration file Cfg . * This is done by looping over all the individual columns. An aid of how * ASCII files that use tabulator stops in * \htmlonly * starbase style * \endhtmlonly * \latexonly * \href{http://cfa-www.harvard.edu/~john/starbase/starbase.html}{starbase style} * \endlatexonly * can be formatted with the UNIX * \htmlonly * awk * \endhtmlonly * \latexonly * \href{http://unixhelp.ed.ac.uk/CGI/man-cgi?awk}{awk} * \endlatexonly * is added as a comment at the end. * @param[in,out] os * @param[in] cfg * @since 2007-02-28 */ ostream & operator<< (ostream &os, const altCfg &cfg) { for(int c=0 ; c < cfg.cols.size() ; c++) os << cfg.cols[c] << endl ; os << "# Hint: conversion of rgb tab-separated input files into blank separated:\n" ; os << "# awk -F '\\t' '{ printf " << cfg.awkPrintf() << "}' < rgb.txt > ascii.txt\n" ; return os ; } /** Print a syntax note to \c stderr. * @param[in] argv the UNIX command line arguments, including the command name (executable) */ void usage(char *argv[]) { cerr << "usage: " << argv[0] << " [-f] [-w] [-k keyname] [-a] [-n sqlTblname] -c fmt.cfg < inputcat.txt > outfile.sql " << endl ; cerr << "usage: " << argv[0] << " -C altfmt.cfg < simpleAltFmt.cfg > fmt.cfg " << endl ; } /** \section SYNOPSIS @code cat2Sql [-w] [-k keyname] [-a] [-n sqlname] -c fmt.cfg < inputcat.txt > out.sql cat2Sql -f [-w] [-k keyname] [-a] [-n sqlname] -c fmt.cfg < inputcat.txt > out.fits cat2Sql -C altfmt.cfg > fmt.cfg @endcode \section DESCRIPTION The program converts an ASCII table provided by the standard input into a sequence of SQL commands (which would be used with the sqlite3(1) command .read, for example, or a similar mysql(1)) that feed a SQL interpreter with the contents of this table. Each line of the standard input becomes one line in the SQL table; the column layout of the standard input and the names of the columns in the SQL table must be detailed with the mandatory '-c' switch pointing to an ASCII configuration file with read-access rights. \section OPTIONS -a The program usually ignored lines of the standard input that start with the '#' character. This can be disabled by using the command line switch '-a' ("all") which triggers that even the lines that start with the sharp are considered to end up in the SQL database. -c Names the configuration file that details the widths and names of the table columns of the standard input. -C Indicates that a configuration file altfmt.cfg with an alternative format is converted into the standard format for later use with the option -c. -f Specifies that the standard output should be in the FITS format. Only available if enabled at compile time. -n Changes the SQL table name that is created from the default (which is the configuration file name after removal of any directory name and/or suffix) to a user-specific value. -w Switches on a higher level of warnings on stderr. This current includes (i) Warnings if SQL column names are not unique. (ii) Warnings if the standard input has too short lines to fill the SQL table with all values. (iii) Warnings if a column selected with the -k switch doesn't exist. -u Select one of the column names mentioned in the configuration file as the SQL key. The argument of the switch is the name of the column. If this column is specified with a repetition factor (see below as explained in the configuration file), the argument of the switch must include the explicit index number. \section FILES stdin The standard input must contain lines that are at least as long as demanded by the concatenation of the format specifiers of the configuration file (see below). Trailing columns/bytes will be ignored. The standard input must contain what would be called fixed format in the Fortran nomenclature, which means that the start and end of its columns is determined by the byte position within each row, not by separators (delimiters) like spaces, commas etc. configuration file The configuration file contains - empty lines which are ignored - lines that start with the '#' and are ignored - lines that contain (i) optional white space, (ii) a Fortran-type format/width specifier, The Fortran-type format specifiers are one of the following: - An (the capital letter A followed by a positive integer n) indicating an "ASCII" column of width 'n' - nX (a positive integer n followed by the capital letter X) indicating a number of 'n' Bytes to be skipped - In (the capital letter I followed by a positive integer n) indicating an (optionally signed) integer number with up to n bytes. - Fw.d - Ew.d - Dw.d (a letter F, E or D followed by a width number w, a dot and a decimal precision number d) indicating a floating point number. The width w includes any signs, decimal points, the letter 'e' etc. The A,F,E and D specifier can be preceeded by positive integers to indicate field repetition. The SQL column names will be generated in this case by attaching the numbers 1, 2 and so on to the SQL column name that follows. The lines in the standard input must then provide a multiple of 'w' bytes to fill these repeated columns with information. (iii) white space, (iv) a SQL/FITS column name, (v) optionally white space and trailing contents. This is generally ignored but is used to generate the units and the comment in case of output in FITS format. The units follow the cfitsio convention that they are the first part of the comment, enclosed by brackets. alternative configuration file The alternative configuration file can be translated into the standard format with a call using the -C option. It is a little bit more user-friendly and contains - empty lines which are ignored - lines that start with the '#' and are ignored - lines that contain (i) optional white space, (ii) a byte-range: a single integer with a 1-based byte number or two integers separated by a dash (but no additional white space) indicating a byte range. This represents the range of bytes in each input line of the ASCII table for one column in the final FITS or SQL table. (iii) a Fortran-type format/width specifier, (iv) white space, (v) a column name, (vi) optionally white space and trailing contents (which is ignored). The Fortran-type format specifiers are one of the following: - A (the capital letter A) indicating an "ASCII" column - I (the capital letter I) indicating an (optionally signed) integer number. - F - E - D (a letter F, E or D) indicating a floating point number. The A,F,E and D specifier can be preceeded by positive integers to indicate field repetition. In these cases, the byte-range specified before represents only the first of these columns. @since 2006-05-24 @author Richard J. Mathar */ int main(int argc, char *argv[]) { /* command line switches */ char oc ; /* The unix file name with a configuration file (both syntaxes) */ string cfgfile ; /* name of the SQL or binary FITS table */ string sqlTabname ; /* SQL unique key (its column name ) */ string sqlkey ; /* flag to indicate that also configuration lines starting with hashes * are to be interpreted instead of being treated as comments */ bool allCfg = false ; /* warning level */ bool warn = false ; /* If true, generated output is a FITS file with a binary table, * else a sequnece of SQL feeds. */ bool useFits = false ; /* If true, only a conversion from the simple to the compact * syntax of the configuraion is performed, else the standard input * is used to create a table in one of the two formats. */ bool cfg2cfg = false ; /* trivial syntax check */ if( argc < 3) { usage(argv) ; return 1 ; } /* potential command line options include the \c f switch if * this was sucessfully was compiled with \c CCfits support. */ #ifdef HAVE_CCFITS while ( (oc=getopt(argc,argv,"C:fc:an:wk:")) != -1 ) #else while ( (oc=getopt(argc,argv,"C:c:an:wk:")) != -1 ) #endif { switch(oc) { case 'c' : cfgfile = optarg ; break ; case 'C' : cfg2cfg = true ; cfgfile = optarg ; break ; case 'n' : sqlTabname = optarg ; break ; case 'k' : sqlkey = optarg ; break ; case 'a' : allCfg = true ; break ; case 'f' : useFits = true ; break ; case 'w' : warn = true ; break ; case '?' : cerr << "Invalid command line option " << char(optopt) << " ignored...\n" ; break ; } } /* we try to open the input file with the configuration */ ifstream inp(cfgfile.c_str(),ios::in) ; if ( !inp ) { /* If the configuration file of the -c or -C switch isn't readable, * return with an error. */ cerr << "Cannot open " << cfgfile.c_str() << " for reading\n" ; return 1 ; } else if ( cfg2cfg) { /** scan the input file provided by the -C command line switch */ altCfg conf(inp) ; /** in case there were undefined bytes in the input, insert X specifiers for these * Make sure that the output has a monotonically increasing order because this type * of information is implicit in the output file. */ conf.sortnFill() ; /** Generate the list of column specs */ cout << conf ; /* generate the stdout file with a time stamp, calling the output operator */ time_t now = time(0) ; cout << "# created from " << cfgfile << " by " << __FILE__ << " " << ctime(&now) ; } else { // cerr << "reading " << cfgfile.c_str() << endl ; char ifbase[256] ; strncpy(ifbase,cfgfile.c_str(),255) ; ifbase[255]='\0' ; // overload table name with user request if ( sqlTabname.length() ) { memset(ifbase,'\0',256) ; sqlTabname.copy(ifbase,255) ; } Cfg conf(inp, basename(ifbase), sqlkey) ; inp.close() ; /* if -f switch used (and support compiled in) */ if ( useFits) { #ifdef HAVE_CCFITS /* enforce use of upper-name column names */ for(int c=0 ; c < conf.cols.size() ; c++) conf.cols[c].dbcolname = conf.cols[c].up_name() ; /* debugging * for(int c=0 ; c < conf.cols.size() ; c++) * cerr << c << " " << conf.cols[c].dbcolname << endl ; */ /* create the table header according to the configuration */ FITS *f = conf ; /* handle lines of the standart input, one by one */ while ( !cin.eof() ) { string buf ; getline(cin,buf) ; // skip comment lines unless the 'allCfg'flag had been set. if ( !allCfg && buf[0] == '#') continue ; if ( buf.empty() ) continue ; ExtHDU & hdu = f->extension(1) ; /* append this ascii input to the row. Since the CCfits interface * uses a 1-based row enumeration, we add 1 to the current row count. */ conf.creatInsrt(hdu,buf,hdu.rows()+1L,warn) ; } delete f ; #endif } else { /* If the warning flag has been activated, call Cfg.uniq() and Cfg.keyExist() * to generate warnings. */ if( warn) { conf.uniq() ; conf.keyExist() ; } /* Create the SQL configuration commands */ cout << conf.creatDoc(argc, argv) << endl ; cout << conf.creatTbl() << ";\n" ; /* loop over the \c stdin lines to generate one feed per line into the SQL table. */ while ( !cin.eof() ) { string buf ; getline(cin,buf) ; // skip comment lines unless the 'allCfg'flag had been set. if ( !allCfg && buf[0] == '#') continue ; if ( buf.empty() ) continue ; cout << conf.creatInsrt(buf,warn) << ";\n" ; } } return 0 ; } }