00001 #ifndef TOKENIZER_IMPLEMENTATION_FILE
00002 #define TOKENIZER_IMPLEMENTATION_FILE
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "parser_bits.h"
00019 #include "tokenizer.h"
00020
00021 #include <basis/function.h>
00022 #include <basis/istring.h>
00023 #include <basis/log_base.h>
00024 #include <data_struct/stack.cpp>
00025 #include <data_struct/string_table.h>
00026 #include <data_struct/symbol_table.cpp>
00027
// Placeholder stored by parse() as the value of an entry whose assignment
// character was present but whose value text was empty.  find() translates
// this placeholder back into an empty string for callers.
const char *SPECIAL_VALUE = " ";



// Replace any previous LOG definition so that logging in this file goes
// through CLASS_EMERGENCY_LOG using the program-wide logger.
#undef LOG
#define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger(), s)
00034
00035 tokenizer::tokenizer(int max_bits)
00036 : _implementation(new string_table(max_bits)),
00037 _assignments(new istring("=")),
00038 _separators(new istring(",")),
00039 _quotes(new istring),
00040 _nesting(false),
00041 _comments(new istring),
00042 _comment_number(1),
00043 _add_spaces(false)
00044 {}
00045
00046 tokenizer::tokenizer(const istring &separator, const istring &assignment,
00047 int max_bits)
00048 : _implementation(new string_table(max_bits)),
00049 _assignments(new istring(assignment)),
00050 _separators(new istring(separator)),
00051 _quotes(new istring),
00052 _nesting(false),
00053 _comments(new istring),
00054 _comment_number(1),
00055 _add_spaces(false)
00056 {}
00057
00058 tokenizer::tokenizer(const istring &separator, const istring &assignment,
00059 const istring "es, bool nesting, int max_bits)
00060 : _implementation(new string_table(max_bits)),
00061 _assignments(new istring(assignment)),
00062 _separators(new istring(separator)),
00063 _quotes(new istring(quotes)),
00064 _nesting(nesting),
00065 _comments(new istring),
00066 _comment_number(1),
00067 _add_spaces(false)
00068 {}
00069
00070 tokenizer::tokenizer(const tokenizer &to_copy)
00071 : object_base(),
00072 _implementation(new string_table),
00073 _assignments(new istring),
00074 _separators(new istring),
00075 _quotes(new istring),
00076 _nesting(false),
00077 _comments(new istring),
00078 _comment_number(1),
00079 _add_spaces(false)
00080 { *this = to_copy; }
00081
00082 tokenizer::~tokenizer()
00083 {
00084 WHACK(_separators);
00085 WHACK(_assignments);
00086 WHACK(_implementation);
00087 WHACK(_quotes);
00088 WHACK(_comments);
00089 }
00090
00091 int tokenizer::symbols() const { return _implementation->symbols(); }
00092
00093 void tokenizer::set_comment_chars(const istring &comments)
00094 { *_comments = comments; }
00095
00096 const istring &tokenizer::assignments() const { return *_assignments; }
00097
00098 const istring &tokenizer::separators() const { return *_separators; }
00099
00100 const istring &tokenizer::quotes() const { return *_quotes; }
00101
00102 bool tokenizer::exists(const istring &name) const
00103 { return !!_implementation->find(name); }
00104
00105 void tokenizer::reset() { _implementation->reset(); }
00106
00107 const string_table &tokenizer::table() const { return *_implementation; }
00108
00109 string_table &tokenizer::table() { return *_implementation; }
00110
00111 tokenizer &tokenizer::operator =(const tokenizer &to_copy)
00112 {
00113 if (this == &to_copy) return *this;
00114 *_implementation = *to_copy._implementation;
00115 *_separators = *to_copy._separators;
00116 *_assignments = *to_copy._assignments;
00117 *_quotes = *to_copy._quotes;
00118 _nesting = to_copy._nesting;
00119 _add_spaces = to_copy._add_spaces;
00120 return *this;
00121 }
00122
00123 istring tokenizer::find(const istring &name) const
00124 {
00125 istring *found = _implementation->find(name);
00126 if (!found) return "";
00127
00128
00129 if (*found == SPECIAL_VALUE) return "";
00130 return *found;
00131 }
00132
00133 bool tokenizer::okay_for_variable_name(char to_check) const
00134 {
00135 if (!to_check || separator(to_check) || assignment(to_check)) return false;
00136 return true;
00137 }
00138
00139 bool tokenizer::separator(char to_check) const
00140 {
00141
00142 if (parser_bits::is_eol(to_check)
00143 && (istring::matches(*_separators, '\n')
00144 || istring::matches(*_separators, '\r')) ) return true;
00145 return istring::matches(*_separators, to_check);
00146 }
00147
00148 bool tokenizer::assignment(char to_check) const
00149 { return istring::matches(*_assignments, to_check); }
00150
00151 bool tokenizer::quote_mark(char to_check) const
00152 { return istring::matches(*_quotes, to_check); }
00153
00154 bool tokenizer::comment_char(char to_check) const
00155 { return istring::matches(*_comments, to_check); }
00156
// COOL: helper for parse(); evaluates to the number of characters still
// waiting in parse()'s local "to_tokenize" string, so it is non-zero while
// there is more text to consume.
#define COOL to_tokenize.length()

// CHOP: helper for parse(); copies the first character of "to_tokenize" into
// parse()'s local "current" and then removes that character from the front
// of the string.  Both locals must be in scope where the macro is used.
#define CHOP { \
  current = to_tokenize[0]; \
  to_tokenize.zap(0, 0); \
}
00165
00166 void tokenizer::parse(const istring &to_tokenize_in)
00167 {
00168 FUNCDEF("parse");
00169 istring to_tokenize(to_tokenize_in);
00170
00171
00172 istring name, value;
00173 char current;
00174 bool just_ate_blank_line = false;
00175
00176
00177
00178 if (is_eol_a_separator() && parser_bits::is_eol(to_tokenize[0])) {
00179 CHOP;
00180
00181 while (COOL && parser_bits::white_space(current)) {
00182 CHOP;
00183 if (!parser_bits::white_space(current)) {
00184
00185
00186 to_tokenize.insert(0, istring(current, 1));
00187 }
00188 }
00189 }
00190
00191
00192 while (COOL) {
00193 name.reset();
00194 value.reset();
00195
00196
00197 CHOP;
00198
00199
00200 if (parser_bits::white_space_no_cr(current))
00201 continue;
00202
00203
00204 bool handle_as_comment = false;
00205 if (parser_bits::is_eol(current) && !is_eol_a_separator()) {
00206 continue;
00207 } else if (just_ate_blank_line && parser_bits::is_eol(current)) {
00208 just_ate_blank_line = false;
00209 continue;
00210 } else if (parser_bits::is_eol(current) && is_eol_a_separator()) {
00211 handle_as_comment = true;
00212 }
00213
00214 if (comment_char(current) || handle_as_comment) {
00215
00216 just_ate_blank_line = true;
00217
00218 while (COOL && !separator(current)) {
00219 value += current;
00220 CHOP;
00221 }
00222
00223 isprintf name("%s%d", STRTAB_COMMENT_PREFIX, _comment_number);
00224 _implementation->add(name, value);
00225 _comment_number++;
00226
00227 continue;
00228 }
00229
00230 just_ate_blank_line = false;
00231
00232
00233 if (!okay_for_variable_name(current)) continue;
00234
00235
00236 while (COOL && okay_for_variable_name(current)) {
00237
00238 name += current;
00239 CHOP;
00240 }
00241 if (!COOL) {
00242
00243 if (!separator(current) && !parser_bits::white_space(current) )
00244 name += current;
00245 _implementation->add(name, value);
00246 continue;
00247 }
00248
00249
00250 while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
00251
00252 bool found_assignment = false;
00253 if (assignment(current)) {
00254
00255 CHOP;
00256 found_assignment = true;
00257 }
00258
00259
00260 while (COOL && parser_bits::white_space_no_cr(current)) CHOP;
00261
00262
00263 stack<char> q_stack(!int(_nesting));
00264
00265
00266 while (COOL) {
00267
00268 bool ignore_separator = false;
00269 if (quote_mark(current)) {
00270 if (!q_stack.size()) {
00271
00272 ignore_separator = true;
00273 q_stack.push(current);
00274 } else if (current == q_stack.top()) {
00275
00276 q_stack.pop();
00277
00278
00279 if (q_stack.size())
00280 ignore_separator = true;
00281 } else {
00282
00283
00284
00285
00286 if (q_stack.kind() == stack<char>::UNBOUNDED)
00287 q_stack.push(current);
00288
00289
00290 ignore_separator = true;
00291 }
00292 } else if (q_stack.size()) {
00293
00294
00295 ignore_separator = true;
00296 }
00297
00298
00299 if (!ignore_separator && separator(current)) {
00300 break;
00301 }
00302
00303
00304 value += current;
00305 CHOP;
00306 }
00307
00308 if (!separator(current) && !parser_bits::white_space(current) ) {
00309 value += current;
00310 }
00311
00312 if (found_assignment && !value) {
00313
00314
00315 value = SPECIAL_VALUE;
00316 }
00317
00318
00319
00320 if (name.t()) {
00321
00322 while (parser_bits::white_space_no_cr(name[name.end()]))
00323 name.zap(name.end(), name.end());
00324
00325 if (value != SPECIAL_VALUE)
00326 while (parser_bits::white_space(value[value.end()]))
00327 value.zap(value.end(), value.end());
00328 _implementation->add(name, value);
00329 just_ate_blank_line = true;
00330
00331 name.reset();
00332 value.reset();
00333 }
00334 }
00335 }
00336
00337 bool tokenizer::is_eol_a_separator() const
00338 {
00339 for (int i = 0; i < _separators->length(); i++) {
00340 char sep = _separators->get(i);
00341
00342 if (parser_bits::is_eol(sep)) return true;
00343 }
00344 return false;
00345 }
00346
00347 void tokenizer::text_form(istring &accumulator) const
00348 {
00349 accumulator.reset();
00350 bool added_sep = false;
00351 for (int i = 0; i < _implementation->symbols(); i++) {
00352 added_sep = false;
00353 if (!string_table::is_comment(_implementation->name(i))) {
00354
00355 accumulator += _implementation->name(i);
00356 if (_implementation->operator [](i).t()) {
00357 if (_add_spaces) accumulator += " ";
00358 accumulator += _assignments->get(0);
00359 if (_add_spaces) accumulator += " ";
00360 accumulator += _implementation->operator [](i);
00361 }
00362 } else {
00363
00364 if (_implementation->operator [](i).t())
00365 accumulator += _implementation->operator [](i);
00366 }
00367
00368 if (is_eol_a_separator()) {
00369 accumulator += log_base::platform_ending();
00370 } else {
00371 added_sep = true;
00372 accumulator += _separators->get(0);
00373 accumulator += ' ';
00374 }
00375 }
00376
00377 if (added_sep)
00378 accumulator.zap(accumulator.end() - 1, accumulator.end());
00379 }
00380
00381 istring tokenizer::text_form() const
00382 {
00383 istring accumulator;
00384 text_form(accumulator);
00385 return accumulator;
00386 }
00387
00388
00389 #endif //TOKENIZER_IMPLEMENTATION_FILE
00390