00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include <basis/function.h>
00030 #include <basis/guards.h>
00031 #include <basis/istring.h>
00032 #include <opsystem/application_shell.h>
00033 #include <opsystem/byte_filer.h>
00034 #include <opsystem/command_line.h>
00035 #include <loggers/file_logger.h>
00036 #include <opsystem/filename.h>
00037 #include <data_struct/static_memory_gremlin.h>
00038 #include <textual/parser_bits.h>
00039
// BASE_LOG(s): sends the text 's' to the program-wide logger via its log() method.
00040 #undef BASE_LOG
00041 #define BASE_LOG(s) program_wide_logger().log(s)
// LOG(s): forwards 's' and the program-wide logger to CLASS_EMERGENCY_LOG, which
// is defined outside this file.
// NOTE(review): LOG is not used anywhere in the visible part of this file.
00042 #undef LOG
00043 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger(), s)
00044
// Size limit handed to byte_filer::read() when link_parser::execute() loads the
// input file. MEGABYTE is defined outside this file.
00045 const int MAX_FILE_SIZE = 4 * MEGABYTE;
00046
00047
00049
00050
// INCREM_N_GO: steps the parse cursor 'curr_index' forward by one and restarts
// the enclosing loop.  it is only meaningful inside a loop body, since it
// expands to a 'continue' statement.
#define INCREM_N_GO { ++curr_index; continue; }

// ADD_INTERMEDIATE: appends the character under the cursor (taken from
// 'full_contents' at 'curr_index') to 'intermediate_text'.  the cursor itself
// is not moved.
#define ADD_INTERMEDIATE intermediate_text += full_contents[curr_index]
00056
// Reports whether two characters are the same once ASCII letter case is
// ignored.
//
// Both arguments are folded to upper case before they are compared, so the
// result does not depend on which side holds the lower-case letter.  Only the
// ASCII range 'a'..'z' is folded; every other character must match exactly.
// The folding is done by hand (rather than through the C locale machinery) so
// the answer is the same in every locale.
//
//   to_find         the character being searched for.
//   comparing_with  the character it is being compared against.
//
// Returns true when the characters match ignoring ASCII case, false otherwise.
bool caseless_equals(char to_find, char comparing_with)
{
  // distance between a lower-case ASCII letter and its upper-case partner.
  const int CASE_GAP = 'a' - 'A';

  if ( (to_find >= 'a') && (to_find <= 'z') )
    to_find = static_cast<char>(to_find - CASE_GAP);
  if ( (comparing_with >= 'a') && (comparing_with <= 'z') )
    comparing_with = static_cast<char>(comparing_with - CASE_GAP);

  return to_find == comparing_with;
}
00064
00065
// JUMP_TO_CHAR(to_find, save_them): moves the cursor 'curr_index' forward
// through 'full_contents' until it rests on a character that matches 'to_find'
// (as judged by caseless_equals) or until it reaches the end of the text.
// when 'save_them' is true, each character that is stepped over is first
// appended to 'intermediate_text' through ADD_INTERMEDIATE.  the matching
// character itself is neither saved nor skipped.
#define JUMP_TO_CHAR(to_find, save_them) { \
  for ( ; (curr_index < full_contents.length()) \
      && !caseless_equals(to_find, full_contents[curr_index]); \
      curr_index++) { \
    if (save_them) { ADD_INTERMEDIATE; } \
  } \
}
00073
00074
// NEXT_STATE_INCREM: advances 'state' to the enumerator that numerically
// follows the current one, steps the cursor 'curr_index' past the current
// character and restarts the enclosing loop.  because the next state is
// computed as "state + 1", the declaration order of the parsing_states
// enumerators determines where this macro leads.
#define NEXT_STATE_INCREM { \
  state = static_cast<parsing_states>(state + 1); \
  ++curr_index; \
  continue; \
}
00080
00081
// CLEAN_UP_NAUGHTY(s): normalizes the string 's' in place:
//   1) every newline becomes a space and every carriage return is removed.
//   2) every run of two or more dashes is collapsed into one space.
//   3) every run of spaces is collapsed into one space.
//   4) strip_spaces() is applied to the result.
//
// the space-collapsing pattern in step 3 must be TWO spaces.  replacing one
// space with one space (as this macro previously did) reports a successful
// replacement every time, so the loop never ended for any string containing a
// space.  the pattern is spelled as two adjacent one-character literals so
// that a whitespace-collapsing tool cannot quietly turn it back into a single
// space.
//
// the loops rely on replace() returning true only while it still finds (and
// replaces) an occurrence of its first argument.
#define CLEAN_UP_NAUGHTY(s) { \
  while (s.replace("\n", " ")) {} \
  while (s.replace("\r", "")) {} \
  int indy = s.find("--"); \
  while (non_negative(indy)) { \
    /* the first dash of the run becomes the replacement space... */ \
    s[indy] = ' '; \
    /* ...and each dash that directly follows it is deleted. */ \
    for (int i = indy + 1; i < s.length(); i++) { \
      if (s[i] != '-') break; \
      s.zap(i, i); \
      i--;  /* re-examine the same slot, which now holds the next character. */ \
    } \
    indy = s.find("--"); \
  } \
  while (s.replace(" " " ", " ")) {}  /* two spaces become one. */ \
  s.strip_spaces(); \
}
00098
00099
// MAKE_MORE_ENGLISH(s): turns every underscore in the string 's' into a space
// by way of the string's replace_all() method.
#define MAKE_MORE_ENGLISH(s) (s).replace_all('_', ' ')
00102
00103 void strain_out_html_codes(istring &to_edit)
00104 {
00105 for (int i = 0; i < to_edit.length(); i++) {
00106 if (to_edit[i] != '<') continue;
00107
00108 int indy = to_edit.find('>', i);
00109 if (negative(indy)) return;
00110 to_edit.zap(i, indy);
00111 i--;
00112 }
00113 }
00114
00115
// WRITE_LINK: emits one link record to 'output_file' and bumps '_link_count'.
//
// 'url_string' and 'name_string' are cleaned first.  when the url ends with
// the link's name, the text gathered in 'intermediate_text' is tidied
// (underscores to spaces, html tags removed, whitespace cleaned) and, if
// anything is left of it, used as the name instead.
//
// the record has the form:   "L","<name>","<last title>","<url>"
// followed by a newline.
// NOTE(review): the three values are inserted verbatim between the quotes; a
// double quote inside any of them is not escaped.
#define WRITE_LINK { \
  CLEAN_UP_NAUGHTY(url_string); \
  CLEAN_UP_NAUGHTY(name_string); \
  while (name_string.replace("\n", " ")) {} \
  while (name_string.replace("\r", "")) {} \
  \
  if (url_string.ends(name_string)) { \
    /* try the nearby descriptive text as the link's name. */ \
    MAKE_MORE_ENGLISH(intermediate_text); \
    strain_out_html_codes(intermediate_text); \
    CLEAN_UP_NAUGHTY(intermediate_text); \
    if (intermediate_text.length() > 0) { \
      name_string = intermediate_text; \
    } \
  } \
  \
  istring to_write = istring("\"L\",\"") + name_string + "\",\"" \
      + last_title + "\",\"" + url_string + "\"\n"; \
  output_file.write(to_write); \
  _link_count++; \
}
00141
00142
00143
// WRITE_SECTION: cleans 'last_title', emits one category record for it to
// 'output_file' and bumps '_category_count'.
//
// the record has the form:   "C","<last title>","NOP"
// followed by a newline.
#define WRITE_SECTION { \
  CLEAN_UP_NAUGHTY(last_title); \
  \
  istring to_write("\"C\",\""); \
  to_write += last_title; \
  to_write += istring("\",\"") + "NOP" + "\"\n"; \
  output_file.write(to_write); \
  _category_count++; \
}
00155
00156
// RESET_STRINGS: empties the three per-link accumulators ('url_string',
// 'name_string' and 'intermediate_text') so the next link starts fresh.
// 'last_title' is deliberately not part of this set.
#define RESET_STRINGS { \
  const istring &blank = istring::empty_string(); \
  intermediate_text = blank; \
  name_string = blank; \
  url_string = blank; \
}
00162
00164
00165 class link_parser : public application_shell
00166 {
00167 public:
00168 link_parser() : application_shell(static_class_name()), _link_count(0),
00169 _category_count(0) {}
00170 IMPLEMENT_CLASS_NAME("link_parser");
00171 virtual int execute();
00172 int print_instructions(const filename &program_name);
00173
00174 private:
00175 int _link_count;
00176 int _category_count;
00177
00178 istring url_string;
00179 istring name_string;
00180 istring last_title;
00181 istring intermediate_text;
00182
00183 istring heading_num;
00184
00185 };
00186
00188
00189 int link_parser::print_instructions(const filename &program_name)
00190 {
00191 isprintf to_show("%s:\n\
00192 This program needs two filenames as command line parameters. The -i flag\n\
00193 is used to specify the input filename and the -o flag specifies the output\n\
00194 file to be created. The input file is expected to be an html file\n\
00195 containing links to assorted web sites. The links are gathered, along with\n\
00196 descriptive text that happens to be near them, to create a link database in\n\
00197 the HOOPLE link format and write it to the output file. HOOPLE link format\n\
00198 is basically a CSV file that defines the columns 1-4 for describing either\n\
00199 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
00200 interest). The links are written to a CSV file in the standard HOOPLE link\n\
00201 The HOOPLE link format is documented here:\n\
00202 http://hoople.org/guides/link_database/format_manifesto.txt\n\
00203 ", program_name.basename().raw().s(), program_name.basename().raw().s());
00204 program_wide_logger().log(to_show.s());
00205 return 12;
00206 }
00207
00208 int link_parser::execute()
00209 {
00210 FUNCDEF("main");
00211 SET_DEFAULT_COMBO_LOGGER;
00212
00213 command_line cmds(__argc, __argv);
00214 istring input_filename;
00215 istring output_filename;
00216 if (!cmds.get_value('i', input_filename, false))
00217 return print_instructions(cmds.program_name());
00218 if (!cmds.get_value('o', output_filename, false))
00219 return print_instructions(cmds.program_name());
00220
00221 BASE_LOG(istring("input file: ") + input_filename);
00222 BASE_LOG(istring("output file: ") + output_filename);
00223
00224 istring full_contents;
00225 byte_filer input_file(input_filename, "r");
00226 if (!input_file.good())
00227 non_continuable_error(class_name(), func, "the input file could not be opened");
00228 input_file.read(full_contents, MAX_FILE_SIZE);
00229 input_file.close();
00230
00231 filename outname(output_filename);
00232 if (outname.exists()) {
00233 non_continuable_error(class_name(), func, istring("the output file ")
00234 + output_filename + " already exists. It would be over-written if "
00235 "we continued.");
00236 }
00237
00238 byte_filer output_file(output_filename, "w");
00239 if (!output_file.good())
00240 non_continuable_error(class_name(), func, "the output file could not be opened");
00241
00242 enum parsing_states {
00243
00244 SEEKING_LINK_START,
00245 SEEKING_HREF,
00246 GETTING_URL,
00247 SEEKING_NAME,
00248 GETTING_NAME,
00249 SEEKING_CLOSURE,
00250
00251
00252 SAW_TITLE_START,
00253 GETTING_TITLE
00254 };
00255
00256 int curr_index = 0;
00257 parsing_states state = SEEKING_LINK_START;
00258 while (curr_index < full_contents.length()) {
00259 switch (state) {
00260 case SEEKING_LINK_START:
00261
00262
00263 if (full_contents[curr_index] != '<') {
00264 ADD_INTERMEDIATE;
00265 INCREM_N_GO;
00266 }
00267
00268
00269 curr_index++;
00270 if (caseless_equals('h', full_contents[curr_index])) {
00271
00272 const char next = full_contents[curr_index + 1];
00273 if ( (next >= '0') && (next <= '9') ) {
00274
00275
00276
00277 state = SAW_TITLE_START;
00278 INCREM_N_GO;
00279 }
00280 }
00281 if (!caseless_equals('a', full_contents[curr_index])) {
00282 intermediate_text += '<';
00283 JUMP_TO_CHAR('>', true);
00284 continue;
00285 }
00286
00287 curr_index++;
00288 if (!parser_bits::white_space(full_contents[curr_index])) {
00289 intermediate_text += "<a";
00290 JUMP_TO_CHAR('>', true);
00291 continue;
00292 }
00293
00294 NEXT_STATE_INCREM;
00295 break;
00296 case SEEKING_HREF:
00297 JUMP_TO_CHAR('h', false);
00298 curr_index++;
00299 if (!caseless_equals('r', full_contents[curr_index])) continue;
00300 curr_index++;
00301 if (!caseless_equals('e', full_contents[curr_index])) continue;
00302 curr_index++;
00303 if (!caseless_equals('f', full_contents[curr_index])) continue;
00304 curr_index++;
00305 if (full_contents[curr_index] != '=') continue;
00306 curr_index++;
00307 if (full_contents[curr_index] != '"') continue;
00308
00309
00310 NEXT_STATE_INCREM;
00311 break;
00312 case GETTING_URL:
00313
00314
00315 if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
00316 url_string += full_contents[curr_index];
00317 INCREM_N_GO;
00318 break;
00319 case SEEKING_NAME:
00320 JUMP_TO_CHAR('>', false);
00321 NEXT_STATE_INCREM;
00322 break;
00323 case GETTING_NAME:
00324
00325
00326 if (full_contents[curr_index] == '<') {
00327
00328 if (full_contents[curr_index + 1] == '/')
00329 NEXT_STATE_INCREM;
00330
00331 JUMP_TO_CHAR('>', false);
00332 curr_index++;
00333 continue;
00334 }
00335 name_string += full_contents[curr_index];
00336 INCREM_N_GO;
00337 break;
00338 case SEEKING_CLOSURE:
00339 JUMP_TO_CHAR('>', false);
00340
00341 WRITE_LINK;
00342
00343 RESET_STRINGS;
00344 state = SEEKING_LINK_START;
00345 INCREM_N_GO;
00346 break;
00347 case SAW_TITLE_START:
00348 heading_num = full_contents.substring(curr_index, curr_index);
00349 JUMP_TO_CHAR('>', false);
00350 NEXT_STATE_INCREM;
00351 break;
00352 case GETTING_TITLE: {
00353 int indy = full_contents.find('<', curr_index);
00354 if (negative(indy)) {
00355 state = SEEKING_LINK_START;
00356 continue;
00357 }
00358 last_title = full_contents.substring(curr_index, indy - 1);
00359 WRITE_SECTION;
00360 JUMP_TO_CHAR('<', false);
00361 JUMP_TO_CHAR('>', false);
00362 RESET_STRINGS;
00363 state = SEEKING_LINK_START;
00364 break;
00365 }
00366 default:
00367 non_continuable_error(class_name(), func, "entered erroneous state!");
00368 }
00369 }
00370
00371 if (url_string.t()) WRITE_LINK;
00372
00373 output_file.close();
00374
00375 BASE_LOG(isprintf("wrote %d links in %d categories.", _link_count,
00376 _category_count));
00377
00378 return 0;
00379 }
00380
00382
// HOOPLE_MAIN is defined outside this file; presumably it generates the
// program's entry point, which creates a link_parser and runs it -- confirm
// against the macro's definition.  The second macro argument is left empty.
00383 HOOPLE_MAIN(link_parser, )
00384
00385 #ifdef __BUILD_STATIC_APPLICATION__
00386
00387 #include <basis/array.cpp>
00388 #include <basis/byte_array.cpp>
00389 #include <basis/callstack_tracker.cpp>
00390 #include <basis/chaos.cpp>
00391 #include <basis/convert_utf.cpp>
00392 #include <basis/definitions.cpp>
00393 #include <basis/earth_time.cpp>
00394 #include <basis/guards.cpp>
00395 #include <basis/istring.cpp>
00396 #include <basis/log_base.cpp>
00397 #include <basis/memory_checker.cpp>
00398 #include <basis/mutex.cpp>
00399 #include <basis/object_base.cpp>
00400 #include <basis/outcome.cpp>
00401 #include <basis/packable.cpp>
00402 #include <basis/portable.cpp>
00403 #include <basis/sequence.cpp>
00404 #include <basis/set.cpp>
00405 #include <basis/utility.cpp>
00406 #include <basis/version_record.cpp>
00407 #include <data_struct/amorph.cpp>
00408 #include <data_struct/bit_vector.cpp>
00409 #include <data_struct/byte_hasher.cpp>
00410 #include <data_struct/configurator.cpp>
00411 #include <data_struct/hash_table.cpp>
00412 #include <data_struct/pointer_hash.cpp>
00413 #include <data_struct/stack.cpp>
00414 #include <data_struct/static_memory_gremlin.cpp>
00415 #include <data_struct/string_hash.cpp>
00416 #include <data_struct/string_hasher.cpp>
00417 #include <data_struct/string_table.cpp>
00418 #include <data_struct/symbol_table.cpp>
00419 #include <data_struct/table_configurator.cpp>
00420 #include <loggers/console_logger.cpp>
00421 #include <loggers/file_logger.cpp>
00422 #include <loggers/locked_logger.cpp>
00423 #include <loggers/null_logger.cpp>
00424 #include <loggers/program_wide_logger.cpp>
00425 #include <opsystem/application_base.cpp>
00426 #include <opsystem/application_shell.cpp>
00427 #include <opsystem/byte_filer.cpp>
00428 #include <opsystem/command_line.cpp>
00429 #include <opsystem/critical_events.cpp>
00430 #include <opsystem/directory.cpp>
00431 #include <opsystem/filename.cpp>
00432 #include <opsystem/ini_config.cpp>
00433 #include <opsystem/ini_parser.cpp>
00434 #include <opsystem/path_configuration.cpp>
00435 #include <opsystem/rendezvous.cpp>
00436 #include <textual/byte_format.cpp>
00437 #include <textual/parser_bits.cpp>
00438 #include <textual/string_manipulation.cpp>
00439 #include <textual/tokenizer.cpp>
00440 #endif // __BUILD_STATIC_APPLICATION__
00441