link_parser.cpp

Go to the documentation of this file.
00001 /*****************************************************************************\
00002 *                                                                             *
00003 *  Name   : link_parser                                                       *
00004 *  Author : Chris Koeritz                                                     *
00005 *                                                                             *
00006 *  Purpose:                                                                   *
00007 *                                                                             *
00008 *    Processes html files and finds the links.  A database in the HOOPLE      *
00009 *  link format is created from the links found.                               *
00010 *                                                                             *
00011 *******************************************************************************
00012 * Copyright (c) 1991-$now By Author.  This program is free software; you can  *
00013 * redistribute it and/or modify it under the terms of the GNU General Public  *
00014 * License as published by the Free Software Foundation; either version 2 of   *
00015 * the License or (at your option) any later version.  This is online at:      *
00016 *     http://www.fsf.org/copyleft/gpl.html                                    *
00017 * Please send any updates to: fred@gruntose.com                               *
00018 \*****************************************************************************/
00019 
00020 // Notes:
00021 //
00022 // the standard link structure in html is similar to this:
00023 //     <a href="blahblah">Link Name and Launching Point</a>
00024 //
00025 // the standard we adopt for section titles is that it must be a heading
00026 // marker.  that formatting looks like this, for example:
00027 //     <h3 assorted_stuff>The Section Title:</h3>
00028 
00029 #include <basis/function.h>
00030 #include <basis/guards.h>
00031 #include <basis/istring.h>
00032 #include <opsystem/application_shell.h>
00033 #include <opsystem/byte_filer.h>
00034 #include <opsystem/command_line.h>
00035 #include <loggers/file_logger.h>
00036 #include <opsystem/filename.h>
00037 #include <data_struct/static_memory_gremlin.h>
00038 #include <textual/parser_bits.h>
00039 
00040 #undef BASE_LOG
00041 #define BASE_LOG(s) program_wide_logger().log(s)
00042 #undef LOG
00043 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger(), s)
00044 
00045 const int MAX_FILE_SIZE = 4 * MEGABYTE;
00046   // this is the largest html file size we will process.
00047 
00049 
// moves the parse cursor forward one character and restarts the enclosing
// loop.  this must stay a bare brace block (not do/while) so that the
// "continue" applies to the loop the macro is used in.  expects "curr_index"
// to be in scope.
#define INCREM_N_GO { ++curr_index; continue; }

// appends the character under the parse cursor to the intermediate string.
// expects "intermediate_text", "full_contents" and "curr_index" in scope.
#define ADD_INTERMEDIATE \
  intermediate_text += full_contents[curr_index]
00056 
// returns true when the two characters are the same, or when they are the
// same ASCII letter in different cases.  the comparison is symmetric, so
// ('a', 'A') and ('A', 'a') both match.  non-letters only match themselves.
bool caseless_equals(char to_find, char comparing_with)
{
  if (to_find == comparing_with) return true;
  // distance between a lower case ASCII letter and its upper case twin.
  const char case_gap = 'a' - 'A';
  if ( (to_find >= 'a') && (to_find <= 'z') )
    return (to_find - case_gap) == comparing_with;
  if ( (to_find >= 'A') && (to_find <= 'Z') )
    return (to_find + case_gap) == comparing_with;
  return false;
}
00064 
// advances the parse cursor until it rests on a character that matches
// "to_find" (per caseless_equals) or until the contents run out.  when
// "save_them" is true, every character skipped over is appended to the
// intermediate text.  the matching character itself is neither saved nor
// skipped.  expects "curr_index" and "full_contents" to be in scope.
#define JUMP_TO_CHAR(to_find, save_them) { \
  for ( ; curr_index < full_contents.length(); curr_index++) { \
    if (caseless_equals(to_find, full_contents[curr_index])) break; \
    if (save_them) ADD_INTERMEDIATE; \
  } \
}
00073 
// steps "state" to the enumerator that follows it, moves the parse cursor
// ahead one character and restarts the enclosing loop.  relies on the
// declaration order of the parsing_states enum.
#define NEXT_STATE_INCREM { \
  state = static_cast<parsing_states>(state + 1);  /* next state in line. */ \
  ++curr_index; \
  continue; \
}
00080 
// scrubs the disallowed characters out of the string "s":
//   - every "\n" becomes a space and every "\r" is removed,
//   - each run of two or more dashes becomes a single space,
//   - doubled spaces are collapsed to single spaces,
//   - and finally strip_spaces() is applied.
// "s" is expanded many times, so pass a plain variable name.
#define CLEAN_UP_NAUGHTY(s) { \
  while (s.replace("\n", " ")) {} \
  while (s.replace("\r", "")) {} \
  for (int dash_at = s.find("--"); non_negative(dash_at); \
      dash_at = s.find("--")) { \
    s[dash_at] = ' ';  /* the first dash of the run turns into a space. */ \
    /* the remainder of the run is deleted one dash at a time. */ \
    while ( (dash_at + 1 < s.length()) && (s[dash_at + 1] == '-') ) \
      s.zap(dash_at + 1, dash_at + 1); \
  } \
  while (s.replace("  ", " ")) {} \
  s.strip_spaces(); \
}
00098 
// swaps every underscore in "s" for a space; used on text that is supposed
// to read as english.
#define MAKE_MORE_ENGLISH(s) \
  (s).replace_all('_', ' ')
00102 
00103 void strain_out_html_codes(istring &to_edit)
00104 {
00105   for (int i = 0; i < to_edit.length(); i++) {
00106     if (to_edit[i] != '<') continue;
00107     // found a left bracket.
00108     int indy = to_edit.find('>', i);
00109     if (negative(indy)) return;  // bail out, unexpected unmatched bracket.
00110     to_edit.zap(i, indy);
00111     i--;  // skip back to reconsider current place.
00112   }
00113 }
00114 
// writes out the currently accumulated link info as one HOOPLE link record:
//     "L","<name>","<last section title>","<url>"
// and bumps _link_count.  expects url_string, name_string, last_title,
// intermediate_text, output_file and _link_count to be in scope.
#define WRITE_LINK { \
  /* clean naughty characters out of the names.  CLEAN_UP_NAUGHTY already */ \
  /* removes every newline and carriage return, so no second pass is done. */ \
  CLEAN_UP_NAUGHTY(url_string); \
  CLEAN_UP_NAUGHTY(name_string); \
  if (url_string.ends(name_string)) { \
    /* handle the name being boring (it is just the tail of the url). */ \
    /* replace it with the intermediate text, if there is any. */ \
    MAKE_MORE_ENGLISH(intermediate_text); \
    strain_out_html_codes(intermediate_text); \
    CLEAN_UP_NAUGHTY(intermediate_text); \
    if (intermediate_text.length()) \
      name_string = intermediate_text; \
  } \
  /* output a link in the HOOPLE format. */ \
  istring to_write = "\"L\",\""; \
  to_write += name_string; \
  to_write += "\",\""; \
  to_write += last_title; \
  to_write += "\",\""; \
  to_write += url_string; \
  to_write += "\"\n"; \
  output_file.write(to_write); \
  _link_count++; \
}
00141 
// writes out the current section as one HOOPLE category record:
//     "C","<section title>","NOP"
// and bumps _category_count.  the parent category is currently always NOP.
// expects last_title, output_file and _category_count to be in scope.
#define WRITE_SECTION { \
  CLEAN_UP_NAUGHTY(last_title);  /* scrub the section name first. */ \
  /* assemble the category definition in one expression. */ \
  istring to_write = istring("\"C\",\"") + last_title \
      + "\",\"NOP\"\n"; \
  output_file.write(to_write); \
  _category_count++; \
}
00155 
// empties the per-link accumulators (intermediate text, name and url).
// last_title is left alone, so the current section title carries over.
#define RESET_STRINGS { \
  intermediate_text = istring::empty_string(); \
  name_string = istring::empty_string(); \
  url_string = istring::empty_string(); \
}
00162 
00164 
00165 class link_parser : public application_shell
00166 {
00167 public:
00168   link_parser() : application_shell(static_class_name()), _link_count(0),
00169       _category_count(0) {}
00170   IMPLEMENT_CLASS_NAME("link_parser");
00171   virtual int execute();
00172   int print_instructions(const filename &program_name);
00173 
00174 private:
00175   int _link_count;  // number of links.
00176   int _category_count;  // number of categories.
00177 
00178   istring url_string;  // the URL we've parsed.
00179   istring name_string;  // the name that we've parsed for the URL.
00180   istring last_title;  // the last name that was set for a section.
00181   istring intermediate_text;  // strings we saw before a link.
00182 
00183   istring heading_num;
00184     // this string form of a number tracks what kind of heading was started.
00185 };
00186 
00188 
00189 int link_parser::print_instructions(const filename &program_name)
00190 {
00191   isprintf to_show("%s:\n\
00192 This program needs two filenames as command line parameters.  The -i flag\n\
00193 is used to specify the input filename and the -o flag specifies the output\n\
00194 file to be created.  The input file is expected to be an html file\n\
00195 containing links to assorted web sites.  The links are gathered, along with\n\
00196 descriptive text that happens to be near them, to create a link database in\n\
00197 the HOOPLE link format and write it to the output file.  HOOPLE link format\n\
00198 is basically a CSV file that defines the columns 1-4 for describing either\n\
00199 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
00200 interest).  The links are written to a CSV file in the standard HOOPLE link\n\
00201 The HOOPLE link format is documented here:\n\
00202     http://hoople.org/guides/link_database/format_manifesto.txt\n\
00203 ", program_name.basename().raw().s(), program_name.basename().raw().s());
00204   program_wide_logger().log(to_show.s());
00205   return 12;
00206 }
00207 
00208 int link_parser::execute()
00209 {
00210   FUNCDEF("main");
00211   SET_DEFAULT_COMBO_LOGGER;
00212 
00213   command_line cmds(__argc, __argv);  // process the command line parameters.
00214   istring input_filename;  // we'll store our bookmarks file's name here.
00215   istring output_filename;  // where the processed marks go.
00216   if (!cmds.get_value('i', input_filename, false))
00217     return print_instructions(cmds.program_name());
00218   if (!cmds.get_value('o', output_filename, false))
00219     return print_instructions(cmds.program_name());
00220 
00221   BASE_LOG(istring("input file: ") + input_filename);
00222   BASE_LOG(istring("output file: ") + output_filename);
00223 
00224   istring full_contents;
00225   byte_filer input_file(input_filename, "r");
00226   if (!input_file.good())
00227     non_continuable_error(class_name(), func, "the input file could not be opened");
00228   input_file.read(full_contents, MAX_FILE_SIZE);
00229   input_file.close();
00230 
00231   filename outname(output_filename);
00232   if (outname.exists()) {
00233     non_continuable_error(class_name(), func, istring("the output file ")
00234         + output_filename + " already exists.  It would be over-written if "
00235         "we continued.");
00236   }
00237 
00238   byte_filer output_file(output_filename, "w");
00239   if (!output_file.good())
00240     non_continuable_error(class_name(), func, "the output file could not be opened");
00241 
00242   enum parsing_states {
00243     // the states below are order dependent; do not change the ordering!
00244     SEEKING_LINK_START,  // looking for the beginning of an html link.
00245     SEEKING_HREF,  // finding the href portion of the link.
00246     GETTING_URL,  // chowing on the URL portion of the link.
00247     SEEKING_NAME,  // finding the closing bracket of the <a ...>.
00248     GETTING_NAME,  // chowing down on characters in the link's name.
00249     SEEKING_CLOSURE,  // looking for the </a> to end the link.
00250     // there is a discontinuity after SEEKING_CLOSURE, but then the following
00251     // states are also order dependent.
00252     SAW_TITLE_START,  // the beginning of a section heading was seen.
00253     GETTING_TITLE  // grabbing characters in the title.
00254   };
00255 
00256   int curr_index = 0;
00257   parsing_states state = SEEKING_LINK_START;
00258   while (curr_index < full_contents.length()) {
00259     switch (state) {
00260       case SEEKING_LINK_START:
00261         // if we don't see a less-than, then it's not the start of html code,
00262         // so we'll ignore it for now.
00263         if (full_contents[curr_index] != '<') {
00264           ADD_INTERMEDIATE;
00265           INCREM_N_GO;
00266         }
00267         // found a left angle bracket, so now we need to make sure this is
00268         // an address style code.
00269         curr_index++;
00270         if (caseless_equals('h', full_contents[curr_index])) {
00271           // check that we're seeing a heading definition here.
00272           const char next = full_contents[curr_index + 1];
00273           if ( (next >= '0') && (next <= '9') ) {
00274             // we found our proper character for starting a heading.  we need
00275             // to jump into that state now.  we'll leave the cursor at the
00276             // beginning of the number.
00277             state = SAW_TITLE_START;
00278             INCREM_N_GO;
00279           }
00280         }
00281         if (!caseless_equals('a', full_contents[curr_index])) {
00282           intermediate_text += '<';
00283           JUMP_TO_CHAR('>', true);
00284           continue; 
00285         }
00286         // found an a, but make sure that's the only character in the word.
00287         curr_index++;
00288         if (!parser_bits::white_space(full_contents[curr_index])) {
00289           intermediate_text += "<a";
00290           JUMP_TO_CHAR('>', true);
00291           continue; 
00292         }
00293         // this looks like an address so find the start of the href.
00294         NEXT_STATE_INCREM;
00295         break;
00296       case SEEKING_HREF:
00297         JUMP_TO_CHAR('h', false);  // find the next 'h' for "href".
00298         curr_index++;
00299         if (!caseless_equals('r', full_contents[curr_index])) continue;
00300         curr_index++;
00301         if (!caseless_equals('e', full_contents[curr_index])) continue;
00302         curr_index++;
00303         if (!caseless_equals('f', full_contents[curr_index])) continue;
00304         curr_index++;
00305         if (full_contents[curr_index] != '=') continue;
00306         curr_index++;
00307         if (full_contents[curr_index] != '"') continue;
00308         // whew, got through the word href and the assignment.  the rest
00309         // should all be part of the link.
00310         NEXT_STATE_INCREM;
00311         break;
00312       case GETTING_URL:
00313         // as long as we don't see the closure of the quoted string for the
00314         // href, then we can keep accumulating characters from it.
00315         if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
00316         url_string += full_contents[curr_index];
00317         INCREM_N_GO;  // keep chewing on it in this same state.
00318         break;
00319       case SEEKING_NAME:
00320         JUMP_TO_CHAR('>', false);  // find closing bracket.
00321         NEXT_STATE_INCREM;  // now start grabbing the name characters.
00322         break;
00323       case GETTING_NAME:
00324         // we have to stop grabbing name characters when we spy a new code
00325         // being started.
00326         if (full_contents[curr_index] == '<') {
00327           // if we see a closing command, then we assume it's the one we want.
00328           if (full_contents[curr_index + 1] == '/')
00329             NEXT_STATE_INCREM;
00330           // if we see html inside the name, we just throw it out.
00331           JUMP_TO_CHAR('>', false);
00332           curr_index++;
00333           continue;
00334         }
00335         name_string += full_contents[curr_index];
00336         INCREM_N_GO;  // keep chewing on it in this same state.
00337         break;
00338       case SEEKING_CLOSURE:
00339         JUMP_TO_CHAR('>', false);  // find the closure of the html code.
00340         // write the link out now.
00341         WRITE_LINK;
00342         // clean out our accumulated strings.
00343         RESET_STRINGS;
00344         state = SEEKING_LINK_START;
00345         INCREM_N_GO;
00346         break;
00347       case SAW_TITLE_START:
00348         heading_num = full_contents.substring(curr_index, curr_index);
00349         JUMP_TO_CHAR('>', false);
00350         NEXT_STATE_INCREM;  // start eating the name.
00351         break;
00352       case GETTING_TITLE: {
00353         int indy = full_contents.find('<', curr_index);
00354         if (negative(indy)) {
00355           state = SEEKING_LINK_START;  // too weird, go back to start.
00356           continue;
00357         }
00358         last_title = full_contents.substring(curr_index, indy - 1);
00359         WRITE_SECTION;
00360         JUMP_TO_CHAR('<', false);  // now find the start of the header closure.
00361         JUMP_TO_CHAR('>', false);  // now find the end of the header closure.
00362         RESET_STRINGS;
00363         state = SEEKING_LINK_START;  // successfully found section name.
00364         break;
00365       }
00366       default:
00367         non_continuable_error(class_name(), func, "entered erroneous state!");
00368     }
00369   }
00370 
00371   if (url_string.t()) WRITE_LINK;
00372 
00373   output_file.close();
00374 
00375   BASE_LOG(isprintf("wrote %d links in %d categories.", _link_count,
00376       _category_count));
00377 
00378   return 0;
00379 }
00380 
00382 
00383 HOOPLE_MAIN(link_parser, )
00384 
00385 #ifdef __BUILD_STATIC_APPLICATION__
00386   // static dependencies found by buildor_gen_deps.sh:
00387   #include <basis/array.cpp>
00388   #include <basis/byte_array.cpp>
00389   #include <basis/callstack_tracker.cpp>
00390   #include <basis/chaos.cpp>
00391   #include <basis/convert_utf.cpp>
00392   #include <basis/definitions.cpp>
00393   #include <basis/earth_time.cpp>
00394   #include <basis/guards.cpp>
00395   #include <basis/istring.cpp>
00396   #include <basis/log_base.cpp>
00397   #include <basis/memory_checker.cpp>
00398   #include <basis/mutex.cpp>
00399   #include <basis/object_base.cpp>
00400   #include <basis/outcome.cpp>
00401   #include <basis/packable.cpp>
00402   #include <basis/portable.cpp>
00403   #include <basis/sequence.cpp>
00404   #include <basis/set.cpp>
00405   #include <basis/utility.cpp>
00406   #include <basis/version_record.cpp>
00407   #include <data_struct/amorph.cpp>
00408   #include <data_struct/bit_vector.cpp>
00409   #include <data_struct/byte_hasher.cpp>
00410   #include <data_struct/configurator.cpp>
00411   #include <data_struct/hash_table.cpp>
00412   #include <data_struct/pointer_hash.cpp>
00413   #include <data_struct/stack.cpp>
00414   #include <data_struct/static_memory_gremlin.cpp>
00415   #include <data_struct/string_hash.cpp>
00416   #include <data_struct/string_hasher.cpp>
00417   #include <data_struct/string_table.cpp>
00418   #include <data_struct/symbol_table.cpp>
00419   #include <data_struct/table_configurator.cpp>
00420   #include <loggers/console_logger.cpp>
00421   #include <loggers/file_logger.cpp>
00422   #include <loggers/locked_logger.cpp>
00423   #include <loggers/null_logger.cpp>
00424   #include <loggers/program_wide_logger.cpp>
00425   #include <opsystem/application_base.cpp>
00426   #include <opsystem/application_shell.cpp>
00427   #include <opsystem/byte_filer.cpp>
00428   #include <opsystem/command_line.cpp>
00429   #include <opsystem/critical_events.cpp>
00430   #include <opsystem/directory.cpp>
00431   #include <opsystem/filename.cpp>
00432   #include <opsystem/ini_config.cpp>
00433   #include <opsystem/ini_parser.cpp>
00434   #include <opsystem/path_configuration.cpp>
00435   #include <opsystem/rendezvous.cpp>
00436   #include <textual/byte_format.cpp>
00437   #include <textual/parser_bits.cpp>
00438   #include <textual/string_manipulation.cpp>
00439   #include <textual/tokenizer.cpp>
00440 #endif // __BUILD_STATIC_APPLICATION__
00441 

Generated on Fri Nov 21 04:28:58 2008 for HOOPLE Libraries by  doxygen 1.5.1