00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029 #include <application/hoople_main.h>
00030 #include <basis/astring.h>
00031 #include <basis/functions.h>
00032 #include <basis/guards.h>
00033 #include <filesystem/byte_filer.h>
00034 #include <filesystem/filename.h>
00035 #include <loggers/critical_events.h>
00036 #include <loggers/file_logger.h>
00037 #include <structures/static_memory_gremlin.h>
00038 #include <textual/parser_bits.h>
00039
00040 using namespace application;
00041 using namespace basis;
00042 using namespace filesystem;
00043 using namespace loggers;
00044 using namespace structures;
00045 using namespace textual;
00046
00047 #undef BASE_LOG
00048 #define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)
00049 #undef LOG
00050 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)
00051
00052 const int MAX_FILE_SIZE = 4 * MEGABYTE;
00053
00054
00056
00057
// Steps past the current input character and restarts the enclosing loop.
// The body must stay a plain brace block: wrapping it in do { } while (0)
// would make the "continue" bind to that inner loop instead of the parser's.
#define INCREM_N_GO { ++curr_index; continue; }

// Appends the input character at "curr_index" to "intermediate_text".  The
// expansion has no trailing semicolon; the caller supplies it.
#define ADD_INTERMEDIATE \
  intermediate_text += full_contents[curr_index]
00063
// Compares two characters while ignoring ASCII letter case.
//
// Returns true when "to_find" and "comparing_with" are the same character, or
// when they are the upper-case and lower-case forms of the same ASCII letter
// (in either order).  Non-letters only match themselves.  The comparison is
// done by hand on the ASCII ranges, so it does not depend on the locale.
bool caseless_equals(char to_find, char comparing_with)
{
  if (to_find == comparing_with) return true;

  // distance between a lower-case ASCII letter and its upper-case twin.
  const int case_gap = 'a' - 'A';

  if ( (to_find >= 'a') && (to_find <= 'z') )
    return (to_find - case_gap) == comparing_with;
  if ( (to_find >= 'A') && (to_find <= 'Z') )
    return (to_find + case_gap) == comparing_with;
  return false;
}
00071
00072
// Advances "curr_index" through "full_contents" until it rests on a character
// that caseless_equals() accepts for "to_find", or until the end of the input.
// When "save_them" is true, every character that gets skipped is appended to
// the intermediate text via ADD_INTERMEDIATE.  The matching character itself
// is neither saved nor consumed.
#define JUMP_TO_CHAR(to_find, save_them) { \
  for (; curr_index < full_contents.length(); curr_index++) { \
    if (caseless_equals(to_find, full_contents[curr_index])) break; \
    if (save_them) ADD_INTERMEDIATE; \
  } \
}
00080
00081
// Moves the parser to the state that follows the current one in the
// "parsing_states" enumeration, consumes the current input character and
// restarts the enclosing loop.  Kept as a plain brace block so that the
// "continue" applies to the parser's own loop.
#define NEXT_STATE_INCREM { \
  state = static_cast<parsing_states>(state + 1); \
  ++curr_index; \
  continue; \
}
00087
00088
// Tidies the string "s" in place before it is written out:
//   1) every newline becomes a space and every carriage return is removed,
//   2) each run of two or more dashes collapses into a single space,
//   3) runs of spaces are squeezed down to one space,
//   4) strip_spaces() is applied last.
// "s" is evaluated many times and is assigned through, so it must be a plain
// astring variable.  The replace() calls are used as "returns true while
// something was replaced", which is why the space squeeze has to look for TWO
// spaces: a one-space tag would be found again after every replacement and
// the loop would never finish.
#define CLEAN_UP_NAUGHTY(s) { \
  while (s.replace("\n", " ")) {} \
  while (s.replace("\r", "")) {} \
  int indy = s.find("--"); \
  while (non_negative(indy)) { \
    s[indy] = ' '; \
    /* drop all the dashes that follow the one just overwritten. */ \
    while ( (indy + 1 < s.length()) && (s[indy + 1] == '-') ) \
      s.zap(indy + 1, indy + 1); \
    indy = s.find("--"); \
  } \
  while (s.replace("  ", " ")) {} \
  s.strip_spaces(); \
}
00105
00106
// Calls replace_all on "text" to turn each underscore into a space.
#define MAKE_MORE_ENGLISH(text) \
  (text).replace_all('_', ' ')
00109
00110 void strain_out_html_codes(astring &to_edit)
00111 {
00112 for (int i = 0; i < to_edit.length(); i++) {
00113 if (to_edit[i] != '<') continue;
00114
00115 int indy = to_edit.find('>', i);
00116 if (negative(indy)) return;
00117 to_edit.zap(i, indy);
00118 i--;
00119 }
00120 }
00121
00122
// Emits one link record to "output_file" and bumps "_link_count".
//
// The url and name strings are cleaned first.  When the url ends with the
// name, the text gathered between links (with underscores turned into spaces,
// markup strained out and then cleaned) is used as the name instead, provided
// that text is not empty.  The record written is:
//   "L","<name>","<last title>","<url>"
// followed by a newline.
#define WRITE_LINK { \
  CLEAN_UP_NAUGHTY(url_string); \
  CLEAN_UP_NAUGHTY(name_string); \
  while (name_string.replace("\n", " ")) {} \
  while (name_string.replace("\r", "")) {} \
  if (url_string.ends(name_string)) { \
    MAKE_MORE_ENGLISH(intermediate_text); \
    strain_out_html_codes(intermediate_text); \
    CLEAN_UP_NAUGHTY(intermediate_text); \
    if (intermediate_text.length() != 0) { \
      name_string = intermediate_text; \
    } \
  } \
  astring to_write = astring("\"L\",\"") + name_string + "\",\"" \
      + last_title + "\",\"" + url_string + "\"\n"; \
  output_file.write(to_write); \
  ++_link_count; \
}
00148
00149
00150
// Emits one category record to "output_file" and bumps "_category_count".
// The title is cleaned first; the record written is:
//   "C","<last title>","Root"
// followed by a newline.
#define WRITE_SECTION { \
  CLEAN_UP_NAUGHTY(last_title); \
  astring to_write = astring("\"C\",\"") + last_title + "\",\"Root\"\n"; \
  output_file.write(to_write); \
  ++_category_count; \
}
00162
00163
// Empties the three per-link accumulators (url, name and the text collected
// between links) so the next link starts from scratch.  The last title is
// deliberately left alone.
#define RESET_STRINGS { \
  intermediate_text = astring::empty_string(); \
  name_string = astring::empty_string(); \
  url_string = astring::empty_string(); \
}
00169
00171
00172 class link_parser : public application_shell
00173 {
00174 public:
00175 link_parser() : application_shell(), _link_count(0), _category_count(0) {}
00176 DEFINE_CLASS_NAME("link_parser");
00177 virtual int execute();
00178 int print_instructions(const filename &program_name);
00179
00180 private:
00181 int _link_count;
00182 int _category_count;
00183
00184 astring url_string;
00185 astring name_string;
00186 astring last_title;
00187 astring intermediate_text;
00188
00189 astring heading_num;
00190
00191 };
00192
00194
00195 int link_parser::print_instructions(const filename &program_name)
00196 {
00197 a_sprintf to_show("%s:\n\
00198 This program needs two filenames as command line parameters. The -i flag\n\
00199 is used to specify the input filename and the -o flag specifies the output\n\
00200 file to be created. The input file is expected to be an html file\n\
00201 containing links to assorted web sites. The links are gathered, along with\n\
00202 descriptive text that happens to be near them, to create a link database in\n\
00203 the HOOPLE link format and write it to the output file. HOOPLE link format\n\
00204 is basically a CSV file that defines the columns 1-4 for describing either\n\
00205 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
00206 interest). The links are written to a CSV file in the standard HOOPLE link\n\
00207 The HOOPLE link format is documented here:\n\
00208 http://hoople.org/guides/link_database/format_manifesto.txt\n\
00209 ", program_name.basename().raw().s(), program_name.basename().raw().s());
00210 program_wide_logger::get().log(to_show, ALWAYS_PRINT);
00211 return 12;
00212 }
00213
00214 int link_parser::execute()
00215 {
00216 FUNCDEF("main");
00217 command_line cmds(_global_argc, _global_argv);
00218 astring input_filename;
00219 astring output_filename;
00220 if (!cmds.get_value('i', input_filename, false))
00221 return print_instructions(cmds.program_name());
00222 if (!cmds.get_value('o', output_filename, false))
00223 return print_instructions(cmds.program_name());
00224
00225 BASE_LOG(astring("input file: ") + input_filename);
00226 BASE_LOG(astring("output file: ") + output_filename);
00227
00228 astring full_contents;
00229 byte_filer input_file(input_filename, "r");
00230 if (!input_file.good())
00231 non_continuable_error(class_name(), func, "the input file could not be opened");
00232 input_file.read(full_contents, MAX_FILE_SIZE);
00233 input_file.close();
00234
00235 filename outname(output_filename);
00236 if (outname.exists()) {
00237 non_continuable_error(class_name(), func, astring("the output file ")
00238 + output_filename + " already exists. It would be over-written if "
00239 "we continued.");
00240 }
00241
00242 byte_filer output_file(output_filename, "w");
00243 if (!output_file.good())
00244 non_continuable_error(class_name(), func, "the output file could not be opened");
00245
00246 enum parsing_states {
00247
00248 SEEKING_LINK_START,
00249 SEEKING_HREF,
00250 GETTING_URL,
00251 SEEKING_NAME,
00252 GETTING_NAME,
00253 SEEKING_CLOSURE,
00254
00255
00256 SAW_TITLE_START,
00257 GETTING_TITLE
00258 };
00259
00260 int curr_index = 0;
00261 parsing_states state = SEEKING_LINK_START;
00262 while (curr_index < full_contents.length()) {
00263 switch (state) {
00264 case SEEKING_LINK_START:
00265
00266
00267 if (full_contents[curr_index] != '<') {
00268 ADD_INTERMEDIATE;
00269 INCREM_N_GO;
00270 }
00271
00272
00273 curr_index++;
00274 if (caseless_equals('h', full_contents[curr_index])) {
00275
00276 const char next = full_contents[curr_index + 1];
00277 if ( (next >= '0') && (next <= '9') ) {
00278
00279
00280
00281 state = SAW_TITLE_START;
00282 INCREM_N_GO;
00283 }
00284 }
00285 if (!caseless_equals('a', full_contents[curr_index])) {
00286 intermediate_text += '<';
00287 JUMP_TO_CHAR('>', true);
00288 continue;
00289 }
00290
00291 curr_index++;
00292 if (!parser_bits::white_space(full_contents[curr_index])) {
00293 intermediate_text += "<a";
00294 JUMP_TO_CHAR('>', true);
00295 continue;
00296 }
00297
00298 NEXT_STATE_INCREM;
00299 break;
00300 case SEEKING_HREF:
00301 JUMP_TO_CHAR('h', false);
00302 curr_index++;
00303 if (!caseless_equals('r', full_contents[curr_index])) continue;
00304 curr_index++;
00305 if (!caseless_equals('e', full_contents[curr_index])) continue;
00306 curr_index++;
00307 if (!caseless_equals('f', full_contents[curr_index])) continue;
00308 curr_index++;
00309 if (full_contents[curr_index] != '=') continue;
00310 curr_index++;
00311 if (full_contents[curr_index] != '"') continue;
00312
00313
00314 NEXT_STATE_INCREM;
00315 break;
00316 case GETTING_URL:
00317
00318
00319 if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
00320 url_string += full_contents[curr_index];
00321 INCREM_N_GO;
00322 break;
00323 case SEEKING_NAME:
00324 JUMP_TO_CHAR('>', false);
00325 NEXT_STATE_INCREM;
00326 break;
00327 case GETTING_NAME:
00328
00329
00330 if (full_contents[curr_index] == '<') {
00331
00332 if (full_contents[curr_index + 1] == '/')
00333 NEXT_STATE_INCREM;
00334
00335 JUMP_TO_CHAR('>', false);
00336 curr_index++;
00337 continue;
00338 }
00339 name_string += full_contents[curr_index];
00340 INCREM_N_GO;
00341 break;
00342 case SEEKING_CLOSURE:
00343 JUMP_TO_CHAR('>', false);
00344
00345 WRITE_LINK;
00346
00347 RESET_STRINGS;
00348 state = SEEKING_LINK_START;
00349 INCREM_N_GO;
00350 break;
00351 case SAW_TITLE_START:
00352 heading_num = full_contents.substring(curr_index, curr_index);
00353 JUMP_TO_CHAR('>', false);
00354 NEXT_STATE_INCREM;
00355 break;
00356 case GETTING_TITLE: {
00357 int indy = full_contents.find('<', curr_index);
00358 if (negative(indy)) {
00359 state = SEEKING_LINK_START;
00360 continue;
00361 }
00362 last_title = full_contents.substring(curr_index, indy - 1);
00363 WRITE_SECTION;
00364 JUMP_TO_CHAR('<', false);
00365 JUMP_TO_CHAR('>', false);
00366 RESET_STRINGS;
00367 state = SEEKING_LINK_START;
00368 break;
00369 }
00370 default:
00371 non_continuable_error(class_name(), func, "entered erroneous state!");
00372 }
00373 }
00374
00375 if (url_string.t()) WRITE_LINK;
00376
00377 output_file.close();
00378
00379 BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
00380 _category_count));
00381
00382 return 0;
00383 }
00384
00386
// Hands the link_parser class to the HOOPLE_MAIN macro.
// NOTE(review): presumably this expands to the program's main() and runs
// link_parser::execute(); the macro is not defined in this file -- confirm
// against application/hoople_main.h.
00387 HOOPLE_MAIN(link_parser, )
00388