list_parsing.cpp

Go to the documentation of this file.
00001 #ifndef LIST_PARSING_IMPLEMENTATION_FILE
00002 #define LIST_PARSING_IMPLEMENTATION_FILE
00003 
00004 /*****************************************************************************\
00005 *                                                                             *
00006 *  Name   : list_parsing                                                      *
00007 *  Author : Chris Koeritz                                                     *
00008 *  Author : Gary Hardley                                                      *
00009 *  Author : Brit Minor                                                        *
00010 *  Author : Aaron Buchanan                                                    *
00011 *                                                                             *
00012 *******************************************************************************
00013 * Copyright (c) 2002-$now By Author.  This program is free software; you can  *
00014 * redistribute it and/or modify it under the terms of the GNU General Public  *
00015 * License as published by the Free Software Foundation; either version 2 of   *
00016 * the License or (at your option) any later version.  This is online at:      *
00017 *     http://www.fsf.org/copyleft/gpl.html                                    *
00018 * Please send any updates to: fred@gruntose.com                               *
00019 \*****************************************************************************/
00020 
00021 #include "list_parsing.h"
00022 #include "parser_bits.h"
00023 
00024 #include <basis/istring.h>
00025 #include <basis/log_base.h>
00026 #include <basis/set.cpp>
00027 #include <data_struct/string_table.h>
00028 
00029 #include <ctype.h>
00030 
00031 #undef LOG
00032 #define LOG(to_print) CLASS_EMERGENCY_LOG(program_wide_logger(), to_print)
00033 
00034 list_parsing::~list_parsing() {}  // needed since we use the class_name macro.
00035 
00036 // by Gary Hardley.
00037 bool list_parsing::get_ids_from_string(const istring &to_parse,
00038     int_set &identifiers)
00039 {
00040   identifiers.clear();  // clear existing ids, if any.
00041   int_array found;
00042   bool ret = get_ids_from_string(to_parse, found);
00043   if (!ret) return false;
00044   for (int i = 0; i < found.length(); i++) identifiers.add(found[i]);
00045   return true;
00046 }
00047 
00048 // by Gary Hardley.
00049 bool list_parsing::get_ids_from_string(const istring &to_parse,
00050     int_array &identifiers)
00051 {
00052   identifiers.reset();  // clear existing ids, if any.
00053   if (!to_parse) return false;
00054     // if an empty string is passed, return an empty set.
00055 
00056   int last_id = -1;
00057   int tmp_id;
00058   bool done = false;
00059   char last_separator = ' ';
00060 
00061   int index = 0;
00062   while (!done && (index < to_parse.length())) {
00063     tmp_id = 0;
00064     bool got_digit = false;
00065     while ( (to_parse[index] != ',') && (to_parse[index] != '-')
00066         && (to_parse[index] != ' ') && (index < to_parse.length()) ) {
00067       if (!isdigit(to_parse[index])) return false;
00068       tmp_id *= 10;
00069       tmp_id += int(to_parse[index++]) - 0x30;
00070       got_digit = true;
00071     }
00072 
00073     if (got_digit) {
00074       if (tmp_id > MAXINT) return false;
00075 
00076       if (last_id == -1) {
00077         last_id = tmp_id;
00078         identifiers += last_id;
00079       } else {
00080         // if the last separator was a dash, this is a range
00081         if (last_separator == '-') {
00082           if (tmp_id >= last_id) {
00083             for (int i = last_id + 1; i <= tmp_id; i++) 
00084                 identifiers += i;
00085           }
00086           else {
00087             for (int i = tmp_id; i < last_id; i++) 
00088                 identifiers += i;
00089           }
00090           last_id = 0;
00091           last_separator = ' ';
00092         } else {
00093           last_id = tmp_id;
00094           identifiers += last_id;
00095         }
00096       }
00097     } else {
00098       // did not read an address, to_parse[index] must be a non-digit.
00099       if ( (to_parse[index] != ' ') && (to_parse[index] != '-')
00100           && (to_parse[index] != ',') ) return false;
00101       last_separator = to_parse[index++];
00102     }
00103   }
00104   return true;
00105 }
00106 
00107 //by chris koeritz.
00108 istring list_parsing::put_ids_in_string(const int_set &ids, char separator)
00109 {
00110   istring to_return;
00111   for (int i = 0; i < ids.length(); i++) {
00112     to_return += isprintf("%d", ids[i]);
00113     if (i < ids.length() - 1) {
00114       to_return += separator;
00115       to_return += " ";
00116     }
00117   }
00118   return to_return;
00119 }
00120 
00121 //by chris koeritz.
00122 istring list_parsing::put_ids_in_string(const int_array &ids, char separator)
00123 {
00124   istring to_return;
00125   for (int i = 0; i < ids.length(); i++) {
00126     to_return += isprintf("%d", ids[i]);
00127     if (i < ids.length() - 1) {
00128       to_return += separator;
00129       to_return += " ";
00130     }
00131   }
00132   return to_return;
00133 }
00134 
00135 #define ADD_TO_VALUES() \
00136     string.substring(value, start, end - 1); \
00137     if( true == strip_spaces ) value.strip_spaces(); \
00138     if( value.length() > 0 ) values += value;
00139 
00140 bool list_parsing::get_values_from_string(const istring &string,
00141     string_array &values, const char separator, const bool strip_spaces)
00142 {
00143     values.reset();
00144     const int strlen = string.length();
00145     int start = 0;
00146     int end = -1;
00147     istring value;
00148     while( (end = string.find(separator, start)) >= 0 )
00149     {
00150         ADD_TO_VALUES();
00151         start = end + 1;
00152     }
00153     if( start < strlen )    // get the last entry 
00154     {
00155         end = strlen;
00156         ADD_TO_VALUES();
00157     }
00158     return values.length() > 0;
00159 }
00160 
00161 #define LOC_VALID(loc) if( (loc) < 0 ) return false;
00162 
00163 bool list_parsing::get_separated_value(const istring &string, 
00164                                        const istring &name, 
00165                                        istring &value, const byte assign, 
00166                                        const byte separator)
00167 {
00168 //    const int len = string.length();
00169     const int start_pos = string.ifind(name);
00170     LOC_VALID(start_pos);
00171 
00172     const int assign_pos = string.find(assign, start_pos + name.length());
00173     LOC_VALID(assign_pos);
00174 
00175     const int sep_pos = string.find(separator, assign_pos + 1);
00176     LOC_VALID(sep_pos);
00177 
00178     value = string.substring(assign_pos + 1, sep_pos - 1);
00179     return true;
00180 }
00181 
00182 bool list_parsing::get_rest_of_line(const istring &string, 
00183                                     const istring &name, istring &value,
00184                                     const byte assign)
00185 {
00186     const int len = string.length();
00187 
00188     const int start_pos = string.ifind(name);
00189     LOC_VALID(start_pos);
00190 
00191     const int assign_pos = string.find(assign, start_pos + name.length());
00192     LOC_VALID(assign_pos);
00193 
00194     value = string.substring(assign_pos + 1, len - 1);
00195     return true;
00196 }
00197 
00198 int list_parsing::get_positions(const istring &string, const char separator, 
00199                                 int_array &positions)
00200 {
00201     positions.reset();
00202     int pos = -1;
00203     while( (pos = string.find(separator, pos + 1)) >= 0 )
00204         positions += pos;
00205     return positions.length();
00206 }
00207 
00208 //hmmm: need a method that escapes quotes within strings.
00209 
00210 // ensures that quotes inside the string "to_emit" are escaped.
00211 istring list_parsing::emit_quoted_chunk(const istring &to_emit)
00212 {
00213   istring to_return('\0', 256);  // avoid reallocations with large pre-alloc.
00214   to_return = "";  // reset to get blank string but keep pre-alloc.
00215   for (int i = 0; i < to_emit.length(); i++) {
00216     char next_char = to_emit[i];
00217     if (next_char == '"') to_return += "\\";  // add the escape before quote.
00218     to_return += istring(next_char, 1);
00219   }  
00220   return to_return;
00221 }
00222 
00223 void list_parsing::create_csv_line(const string_table &to_csv, istring &target)
00224 {
00225   target = istring::empty_string();
00226   for (int i = 0; i < to_csv.symbols(); i++) {
00227     target += istring("\"") + emit_quoted_chunk(to_csv.name(i))
00228         + "=" + emit_quoted_chunk(to_csv[i]) + "\"";
00229     if (i < to_csv.symbols() - 1) target += ",";
00230   }
00231 }
00232 
00233 void list_parsing::create_csv_line(const string_array &to_csv, istring &target)
00234 {
00235   target = istring::empty_string();
00236   for (int i = 0; i < to_csv.length(); i++) {
00237     target += istring("\"") + emit_quoted_chunk(to_csv[i]) + "\"";
00238     if (i < to_csv.length() - 1) target += ",";
00239   }
00240 }
00241 
00242 // we do handle escaped quotes for csv parsing, so check for backslash.
00243 #define handle_escapes \
00244   if (to_parse[i] == '\\') { \
00245     if (to_parse[i + 1] == '"') { \
00246       i++; \
00247       accumulator += to_parse[i]; \
00248       continue; /* skip normal handling in sequel. */ \
00249     } \
00250   }
00251 
00252 const int ARRAY_PREFILL_AMOUNT = 7;
00253   // a random default for pre-filling.
00254 
00255 #define ADD_LINE_TO_FIELDS(new_line) { \
00256   storage_slot++;  /* move to next place to store item. */ \
00257   /* make sure we have enough space for the next slot and then some. */ \
00258 /*LOG(isprintf("fields curr=%d stowslot=%d", fields.length(), storage_slot));*/ \
00259   if (fields.length() < storage_slot + 2) \
00260     fields.insert(fields.length(), ARRAY_PREFILL_AMOUNT); \
00261 /*LOG(isprintf("now fields=%d stowslot=%d", fields.length(), storage_slot));*/ \
00262   fields[storage_slot] = new_line; \
00263 }
00264 
00265 //hmmm: parameterize what is meant by a quote.  maybe comma too.
00266 //by chris koeritz.
00267 bool list_parsing::parse_csv_line(const istring &to_parse, string_array &fields)
00268 {
00269   FUNCDEF("parse_csv_line");
00270   // the current field we're chowing.  we puff it out to start with to
00271   // avoid paying for expanding its memory later.
00272   istring accumulator(' ', 256);
00273   accumulator = istring::empty_string();
00274 
00275   // the state machine goes through these states until the entire string
00276   // is consumed.
00277   enum states { seeking_quote, eating_string, seeking_comma };
00278   states state = seeking_quote;
00279 
00280   bool no_second_quote = false;  // true if we started without a quote.
00281   bool just_saw_comma = false;  // true if seeking comma was the last state.
00282 
00283   int storage_slot = -1;
00284 
00285   for (int i = 0; i < to_parse.length(); i++) {
00286     switch (state) {
00287       case seeking_quote:
00288         if (parser_bits::white_space(to_parse[i])) continue;
00289         if (to_parse[i] == ',') {
00290           // a missing quoted string counts as an empty string.
00291           ADD_LINE_TO_FIELDS(istring::empty_string());
00292           just_saw_comma = true;
00293           continue;
00294         }
00295         just_saw_comma = false;  // cancel that state.
00296         if (to_parse[i] != '"') {
00297           // short circuit the need for a quote.
00298           accumulator += to_parse[i];
00299           no_second_quote = true;
00300         }
00301         state = eating_string;
00302         break;
00303       case eating_string:
00304         just_saw_comma = false;  // no longer true.
00305         if (no_second_quote && (to_parse[i] != ',') ) {
00306           handle_escapes;
00307           accumulator += to_parse[i];
00308         } else if (!no_second_quote && (to_parse[i] != '"') ) {
00309           handle_escapes;
00310           accumulator += to_parse[i];
00311         } else {
00312           // we found the closing quote (or comma).  add the string.
00313           if (no_second_quote) {
00314             state = seeking_quote;
00315             just_saw_comma = true;
00316           } else state = seeking_comma;
00317           ADD_LINE_TO_FIELDS(accumulator)
00318           accumulator = istring::empty_string();
00319           no_second_quote = false;
00320         }
00321         break;
00322       case seeking_comma:
00323         if (parser_bits::white_space(to_parse[i])) continue;
00324         if (to_parse[i] == ',') {
00325           // we got what we wanted.
00326           state = seeking_quote;
00327           just_saw_comma = true;
00328           continue;
00329         }
00330         // well, there was no comma.  that's an error.
00331         return false;
00332         break;
00333       default:
00334         LOG("erroneous state reached during csv parsing");
00335         break;
00336     }
00337   }
00338   if ( (state == eating_string) && (accumulator.length()) )
00339     ADD_LINE_TO_FIELDS(accumulator)
00340   else if (just_saw_comma)
00341     ADD_LINE_TO_FIELDS(istring::empty_string())
00342   if (fields.length() > storage_slot + 1)
00343     fields.zap(storage_slot + 1, fields.last());
00344   return true;
00345 }
00346 
00349 
00350 
00351 #endif //LIST_PARSING_IMPLEMENTATION_FILE
00352 

Generated on Fri Oct 10 04:28:57 2008 for HOOPLE Libraries by  doxygen 1.5.1