tokenizer.cpp

Go to the documentation of this file.
00001 #ifndef TOKENIZER_IMPLEMENTATION_FILE
00002 #define TOKENIZER_IMPLEMENTATION_FILE
00003 
00004 /*****************************************************************************\
00005 *                                                                             *
00006 *  Name   : tokenizer                                                         *
00007 *  Author : Chris Koeritz                                                     *
00008 *                                                                             *
00009 *******************************************************************************
00010 * Copyright (c) 1997-$now By Author.  This program is free software; you can  *
00011 * redistribute it and/or modify it under the terms of the GNU General Public  *
00012 * License as published by the Free Software Foundation; either version 2 of   *
00013 * the License or (at your option) any later version.  This is online at:      *
00014 *     http://www.fsf.org/copyleft/gpl.html                                    *
00015 * Please send any updates to: fred@gruntose.com                               *
00016 \*****************************************************************************/
00017 
00018 #include "parser_bits.h"
00019 #include "tokenizer.h"
00020 
00021 #include <basis/function.h>
00022 #include <basis/istring.h>
00023 #include <basis/log_base.h>
00024 #include <data_struct/stack.cpp>
00025 #include <data_struct/string_table.h>
00026 #include <data_struct/symbol_table.cpp>
00027 
// marker stored for entries that had an assignment operator but no value
// text after it.  find() turns this marker back into an empty string.
const char *SPECIAL_VALUE = " ";

// point LOG at the program-wide logger, replacing any earlier definition.
#ifdef LOG
  #undef LOG
#endif
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger(), s)
00034 
00035 tokenizer::tokenizer(int max_bits)
00036 : _implementation(new string_table(max_bits)),
00037   _assignments(new istring("=")),
00038   _separators(new istring(",")),
00039   _quotes(new istring),
00040   _nesting(false),
00041   _comments(new istring),
00042   _comment_number(1),
00043   _add_spaces(false)
00044 {}
00045 
00046 tokenizer::tokenizer(const istring &separator, const istring &assignment,
00047     int max_bits)
00048 : _implementation(new string_table(max_bits)),
00049   _assignments(new istring(assignment)),
00050   _separators(new istring(separator)),
00051   _quotes(new istring),
00052   _nesting(false),
00053   _comments(new istring),
00054   _comment_number(1),
00055   _add_spaces(false)
00056 {}
00057 
00058 tokenizer::tokenizer(const istring &separator, const istring &assignment,
00059     const istring &quotes, bool nesting, int max_bits)
00060 : _implementation(new string_table(max_bits)),
00061   _assignments(new istring(assignment)),
00062   _separators(new istring(separator)),
00063   _quotes(new istring(quotes)),
00064   _nesting(nesting),
00065   _comments(new istring),
00066   _comment_number(1),
00067   _add_spaces(false)
00068 {}
00069 
00070 tokenizer::tokenizer(const tokenizer &to_copy)
00071 : object_base(),
00072   _implementation(new string_table),
00073   _assignments(new istring),
00074   _separators(new istring),
00075   _quotes(new istring),
00076   _nesting(false),
00077   _comments(new istring),
00078   _comment_number(1),
00079   _add_spaces(false)
00080 { *this = to_copy; }
00081 
00082 tokenizer::~tokenizer()
00083 {
00084   WHACK(_separators);
00085   WHACK(_assignments);
00086   WHACK(_implementation);
00087   WHACK(_quotes);
00088   WHACK(_comments);
00089 }
00090 
00091 int tokenizer::symbols() const { return _implementation->symbols(); }
00092 
00093 void tokenizer::set_comment_chars(const istring &comments)
00094 { *_comments = comments; }
00095 
00096 const istring &tokenizer::assignments() const { return *_assignments; }
00097 
00098 const istring &tokenizer::separators() const { return *_separators; }
00099 
00100 const istring &tokenizer::quotes() const { return *_quotes; }
00101 
00102 bool tokenizer::exists(const istring &name) const
00103 { return !!_implementation->find(name); }
00104 
00105 void tokenizer::reset() { _implementation->reset(); }
00106 
00107 const string_table &tokenizer::table() const { return *_implementation; }
00108 
00109 string_table &tokenizer::table() { return *_implementation; }
00110 
00111 tokenizer &tokenizer::operator =(const tokenizer &to_copy)
00112 {
00113   if (this == &to_copy) return *this;
00114   *_implementation = *to_copy._implementation;
00115   *_separators = *to_copy._separators;
00116   *_assignments = *to_copy._assignments;
00117   *_quotes = *to_copy._quotes;
00118   _nesting = to_copy._nesting;
00119   _add_spaces = to_copy._add_spaces;
00120   return *this;
00121 }
00122 
00123 istring tokenizer::find(const istring &name) const
00124 {
00125   istring *found = _implementation->find(name);
00126   if (!found) return "";
00127 
00128   // check that the contents are not just our significator of emptiness.
00129   if (*found == SPECIAL_VALUE) return "";
00130   return *found;
00131 }
00132 
00133 bool tokenizer::okay_for_variable_name(char to_check) const
00134 {
00135   if (!to_check || separator(to_check) || assignment(to_check)) return false;
00136   return true;
00137 }
00138 
00139 bool tokenizer::separator(char to_check) const
00140 {
00141   // special case allows a CR separator to be either flavor.
00142   if (parser_bits::is_eol(to_check)
00143       && (istring::matches(*_separators, '\n')
00144            || istring::matches(*_separators, '\r')) ) return true;
00145   return istring::matches(*_separators, to_check);
00146 }
00147 
00148 bool tokenizer::assignment(char to_check) const
00149 { return istring::matches(*_assignments, to_check); }
00150 
00151 bool tokenizer::quote_mark(char to_check) const
00152 { return istring::matches(*_quotes, to_check); }
00153 
00154 bool tokenizer::comment_char(char to_check) const
00155 { return istring::matches(*_comments, to_check); }
00156 
00157 #define COOL to_tokenize.length()
00158   // true if the string should continue to be parsed.
00159 
00160 // sets "current" to the first character in the string.
00161 #define CHOP { \
00162   current = to_tokenize[0]; \
00163   to_tokenize.zap(0, 0); \
00164 }
00165 
00166 void tokenizer::parse(const istring &to_tokenize_in)
00167 {
00168   FUNCDEF("parse");
00169   istring to_tokenize(to_tokenize_in);  // de-const.
00170 //hmmm: do we need a copy?  try scooting based on a current pos.
00171 
00172   istring name, value;  // accumulated during the loop.
00173   char current;  // the most recent character from to_tokenize.
00174   bool just_ate_blank_line = false;
00175     // records when we handle a blank line as a comment.
00176 
00177   // pre-processing to remove extra eols and white space in front.
00178   if (is_eol_a_separator() && parser_bits::is_eol(to_tokenize[0])) {
00179     CHOP;
00180     // chop any white space but don't eat any non-white space coming up.
00181     while (COOL && parser_bits::white_space(current)) {
00182       CHOP;
00183       if (!parser_bits::white_space(current)) {
00184         // oops; we ate something we shouldn't have, since it will be
00185         // chopped when we get in the main loop.
00186         to_tokenize.insert(0, istring(current, 1));
00187       }
00188     }
00189   }
00190 
00191   // loop over the string.
00192   while (COOL) {
00193     name.reset();
00194     value.reset();
00195 
00196     // chop the first character off for analysis.
00197     CHOP;
00198 
00199     // ignore any white space until we hit a variable or other good stuff.
00200     if (parser_bits::white_space_no_cr(current))
00201       continue;
00202 
00203     // ignore eol unless they are in separator list.
00204     bool handle_as_comment = false;
00205     if (parser_bits::is_eol(current) && !is_eol_a_separator()) {
00206       continue;
00207     } else if (just_ate_blank_line && parser_bits::is_eol(current)) {
00208       just_ate_blank_line = false;
00209       continue;
00210     } else if (parser_bits::is_eol(current) && is_eol_a_separator()) {
00211       handle_as_comment = true;
00212     }
00213 
00214     if (comment_char(current) || handle_as_comment) {
00215       // set our flag since we are going to eat the end of line in any case.
00216       just_ate_blank_line = true;
00217       // seek all text until next separator.
00218       while (COOL && !separator(current)) {
00219         value += current;
00220         CHOP;
00221       }
00222       // add the item with our ongoing comment number.
00223       isprintf name("%s%d", STRTAB_COMMENT_PREFIX, _comment_number);
00224       _implementation->add(name, value);
00225       _comment_number++;  // go to next comment number to keep unique.
00226 //LOG(istring("got comment: ") + name + " -> " + value);
00227       continue;  // got our chunk, keep going.
00228     }
00229 
00230     just_ate_blank_line = false;  // reset our flag.
00231 
00232     // skip characters we can't use for a variable name.
00233     if (!okay_for_variable_name(current)) continue;
00234 
00235     // we've found the start of a variable.
00236     while (COOL && okay_for_variable_name(current)) {
00237       // accumulate the variable name.
00238       name += current;
00239       CHOP;  // get the next character.
00240     }
00241     if (!COOL) {
00242       // we're at the end of the line, so deal with this situation.
00243       if (!separator(current) && !parser_bits::white_space(current) )
00244         name += current;  // get the character from the end of the line.
00245       _implementation->add(name, value);  // store what we built.
00246       continue;  // skip the rest; we're at the END of the line man.
00247     }
00248 
00249     // skip spaces after variable name.
00250     while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
00251 
00252     bool found_assignment = false;  // assume there isn't one.
00253     if (assignment(current)) {
00254       // we found the assignment operator and are starting on the value.
00255       CHOP;  // skip the assignment operator.
00256       found_assignment = true;
00257     }
00258 
00259     // skip spaces after the assignment statement.
00260     while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
00261 
00262     // track the quoting that we have to deal with in parsing a value.
00263     stack<char> q_stack(!int(_nesting));
00264       // create an unbounded stack for nesting.
00265 
00266     while (COOL) {
00267       // check if the current character is a quote.
00268       bool ignore_separator = false;
00269       if (quote_mark(current)) {
00270         if (!q_stack.size()) {
00271           // nothing on the stack yet, so start accumulating.
00272           ignore_separator = true;
00273           q_stack.push(current);
00274         } else if (current == q_stack.top()) {
00275           // we got the end of this quoting.
00276           q_stack.pop();
00277           // check if we're done with any quotes.  if not, we still need to
00278           // ignore the separators.
00279           if (q_stack.size())
00280             ignore_separator = true;
00281         } else {
00282           // if we are using a bounded stack, it means we only support one
00283           // level of quoting at a time.  thus, this quote character simply
00284           // falls in as a regular character.  but if we're unbound, then
00285           // we can nest arbitrary levels of quotes.
00286           if (q_stack.kind() == stack<char>::UNBOUNDED)
00287             q_stack.push(current);
00288           // we have something on the stack already so we're still ignoring
00289           // separators.  we just don't care about this type of quote.
00290           ignore_separator = true;
00291         }
00292       } else if (q_stack.size()) {
00293         // it's not a quote but we're still trying to chow the matching
00294         // quote character.
00295         ignore_separator = true;
00296       }
00297 
00298       // look for the separator.
00299       if (!ignore_separator && separator(current)) {
00300         break;
00301       }
00302 
00303       // accumulate the value.
00304       value += current;
00305       CHOP;  // get the next character.
00306     }
00307     // get the last character if it's relevant.
00308     if (!separator(current) && !parser_bits::white_space(current) ) {
00309       value += current;
00310     }
00311 
00312     if (found_assignment && !value) {
00313       // use our special case for empty values, since there was an assignment
00314       // operator but no value afterwards.
00315       value = SPECIAL_VALUE;
00316     }
00317 
00318     // store the accumulated variable name and value, but only if the name
00319     // is non-empty.  otherwise, it's not much of a definition.
00320     if (name.t()) {
00321       // strip spaces at the end of the name.
00322       while (parser_bits::white_space_no_cr(name[name.end()]))
00323         name.zap(name.end(), name.end());
00324       // strip spaces at the end of the value unless it's the special case.
00325       if (value != SPECIAL_VALUE)
00326         while (parser_bits::white_space(value[value.end()]))
00327           value.zap(value.end(), value.end());
00328       _implementation->add(name, value);  // store what we built.
00329       just_ate_blank_line = true;  // flag that we don't want next EOL.
00330       // reset, just in case.
00331       name.reset();
00332       value.reset();
00333     }
00334   }
00335 }
00336 
00337 bool tokenizer::is_eol_a_separator() const
00338 {
00339   for (int i = 0; i < _separators->length(); i++) {
00340     char sep = _separators->get(i);
00341     // correct the separator for platform when it's the end of the line.
00342     if (parser_bits::is_eol(sep)) return true;
00343   }
00344   return false;
00345 }
00346 
00347 void tokenizer::text_form(istring &accumulator) const
00348 {
00349   accumulator.reset();
00350   bool added_sep = false;
00351   for (int i = 0; i < _implementation->symbols(); i++) {
00352     added_sep = false;
00353     if (!string_table::is_comment(_implementation->name(i))) {
00354       // a normal assignment is here.
00355       accumulator += _implementation->name(i);
00356       if (_implementation->operator [](i).t()) {
00357         if (_add_spaces) accumulator += " ";
00358         accumulator += _assignments->get(0);
00359         if (_add_spaces) accumulator += " ";
00360         accumulator += _implementation->operator [](i);
00361       }
00362     } else {
00363       // this one is a comment.  just spit out the value.
00364       if (_implementation->operator [](i).t())
00365         accumulator += _implementation->operator [](i);
00366     }
00367     // correct the separator for platform when it's the end of the line.
00368     if (is_eol_a_separator()) {
00369       accumulator += log_base::platform_ending();
00370     } else {
00371       added_sep = true;  // record that we put a separator in there.
00372       accumulator += _separators->get(0);
00373       accumulator += ' ';
00374     }
00375   }
00376   // strip the final separator and space back off, if we added them.
00377   if (added_sep)
00378     accumulator.zap(accumulator.end() - 1, accumulator.end());
00379 }
00380 
00381 istring tokenizer::text_form() const
00382 {
00383   istring accumulator;
00384   text_form(accumulator);
00385   return accumulator;
00386 }
00387 
00388 
00389 #endif //TOKENIZER_IMPLEMENTATION_FILE
00390 

Generated on Fri Nov 21 04:29:59 2008 for HOOPLE Libraries by  doxygen 1.5.1