link_parser.cpp

Go to the documentation of this file.
00001 /*****************************************************************************\
00002 *                                                                             *
00003 *  Name   : link_parser                                                       *
00004 *  Author : Chris Koeritz                                                     *
00005 *                                                                             *
00006 *  Purpose:                                                                   *
00007 *                                                                             *
00008 *    Processes html files and finds the links.  A database in the HOOPLE      *
00009 *  link format is created from the links found.                               *
00010 *                                                                             *
00011 *******************************************************************************
00012 * Copyright (c) 1991-$now By Author.  This program is free software; you can  *
00013 * redistribute it and/or modify it under the terms of the GNU General Public  *
00014 * License as published by the Free Software Foundation; either version 2 of   *
00015 * the License or (at your option) any later version.  This is online at:      *
00016 *     http://www.fsf.org/copyleft/gpl.html                                    *
00017 * Please send any updates to: fred@gruntose.com                               *
00018 \*****************************************************************************/
00019 
00020 // Notes:
00021 //
00022 // the standard link structure in html is similar to this:
00023 //     <a href="blahblah">Link Name and Launching Point</a>
00024 //
00025 // the standard we adopt for section titles is that it must be a heading
00026 // marker.  that formatting looks like this, for example:
00027 //     <h3 assorted_stuff>The Section Title:</h3>
00028 
00029 #include <application/hoople_main.h>
00030 #include <basis/astring.h>
00031 #include <basis/functions.h>
00032 #include <basis/guards.h>
00033 #include <filesystem/byte_filer.h>
00034 #include <filesystem/filename.h>
00035 #include <loggers/critical_events.h>
00036 #include <loggers/file_logger.h>
00037 #include <structures/static_memory_gremlin.h>
00038 #include <textual/parser_bits.h>
00039 
00040 using namespace application;
00041 using namespace basis;
00042 using namespace filesystem;
00043 using namespace loggers;
00044 using namespace structures;
00045 using namespace textual;
00046 
00047 #undef BASE_LOG  // discard any earlier definition before supplying ours.
00048 #define BASE_LOG(s) program_wide_logger::get().log(s, ALWAYS_PRINT)  /* logs 's' through the program-wide logger using the ALWAYS_PRINT filter. */
00049 #undef LOG  // discard any earlier definition before supplying ours.
00050 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), s)  /* hands 's' and the program-wide logger to CLASS_EMERGENCY_LOG; not used in the code shown here. */
00051 
00052 const int MAX_FILE_SIZE = 4 * MEGABYTE;
00053   // this is the largest html file size we will process.  execute() passes it as the limit when reading the input file; presumably anything past it is never parsed -- TODO confirm against byte_filer::read.
00054 
00056 
00057 // a macro that increments the position in the string ('curr_index') and restarts the enclosing loop with 'continue'.  it expands to a braced block, so it is only usable inside a loop that has 'curr_index' in scope.
00058 #define INCREM_N_GO { curr_index++; continue; }
00059 
00060 // puts the current character (full_contents[curr_index]) on the end of the intermediate string.  this expands to a bare expression; the caller supplies the semicolon.
00061 #define ADD_INTERMEDIATE \
00062   intermediate_text += full_contents[curr_index]
00063 
// compares two characters while ignoring the case of ASCII letters.
//   to_find:        the character being searched for.
//   comparing_with: the character it is tested against.
// returns true when both are the same character, or when they are the same
// letter of the alphabet in different cases.  the test is symmetric: it does
// not matter which of the two is the upper case one.  characters outside of
// a-z and A-Z only ever match themselves.
bool caseless_equals(char to_find, char comparing_with)
{
  if (to_find == comparing_with) return true;
  const int case_gap = 'a' - 'A';
    // distance between a lower case letter and its upper case twin.
  if ( (to_find >= 'a') && (to_find <= 'z') )
    return (to_find - case_gap) == comparing_with;
  if ( (to_find >= 'A') && (to_find <= 'Z') )
    return (to_find + case_gap) == comparing_with;
  return false;
}
00071 
00072 // a macro that skips all characters until the specified one is seen: 'curr_index' moves forward until the character there matches 'to_find' (per caseless_equals) or the text runs out.  the index is left on the matching character, not past it.  when 'save_them' is true, every skipped character is added to 'intermediate_text'.
00073 #define JUMP_TO_CHAR(to_find, save_them) { \
00074   while ( (curr_index < full_contents.length()) \
00075       && !caseless_equals(to_find, full_contents[curr_index]) ) { \
00076     if (save_them) ADD_INTERMEDIATE;  /* optionally keep what is skipped. */ \
00077     curr_index++; \
00078   } \
00079 }
00080 
00081 // increments the state (to the next enumerator of parsing_states in declaration order), steps past the current character and restarts the enclosing loop.
00082 #define NEXT_STATE_INCREM { \
00083   state = parsing_states(state+1);  /* move forward in states. */ \
00084   curr_index++; \
00085   continue; \
00086 }
00087 
00088 // cleans out the disallowed characters in the string provided, in place: line feeds become spaces, carriage returns are dropped, any run of two or more dashes becomes one space, doubled spaces are collapsed and strip_spaces() is applied last.
00089 #define CLEAN_UP_NAUGHTY(s) { \
00090   while (s.replace("\n", " ")) {}  /* keep replacing until replace() reports nothing was changed. */ \
00091   while (s.replace("\r", "")) {} \
00092   int indy = s.find("--"); \
00093   while (non_negative(indy)) { \
00094     s[indy] = ' ';  /* replace the first dash with a space. */ \
00095     for (int i = indy + 1; i < s.length(); i++) { \
00096       if (s[i] != '-') break;  /* the run of dashes is over. */ \
00097       s.zap(i, i);  /* drop this dash... */ \
00098       i--;  /* ...and look at the same position again, since the rest moved down. */ \
00099     } \
00100     indy = s.find("--"); \
00101   } \
00102   while (s.replace("  ", " ")) {} \
00103   s.strip_spaces(); \
00104 }
00105 
00106 // cleans up underscores in areas that are supposed to be english, by turning each '_' in the string into a space.
00107 #define MAKE_MORE_ENGLISH(s) \
00108   s.replace_all('_', ' ')
00109 
00110 void strain_out_html_codes(astring &to_edit)
00111 {
00112   for (int i = 0; i < to_edit.length(); i++) {
00113     if (to_edit[i] != '<') continue;
00114     // found a left bracket.
00115     int indy = to_edit.find('>', i);
00116     if (negative(indy)) return;  // bail out, unexpected unmatched bracket.
00117     to_edit.zap(i, indy);
00118     i--;  // skip back to reconsider current place.
00119   }
00120 }
00121 
00122 // writes out the currently accumulated link info as one line of the form "L","name","category","url" and bumps '_link_count'.  expects url_string, name_string, intermediate_text, last_title and output_file to be in scope where it is used.
00123 #define WRITE_LINK { \
00124   /* clean naughty characters out of the names. */ \
00125   CLEAN_UP_NAUGHTY(url_string); \
00126   CLEAN_UP_NAUGHTY(name_string); \
00127   while (name_string.replace("\n", " ")) {}  /* repeats a step CLEAN_UP_NAUGHTY just performed on name_string. */ \
00128   while (name_string.replace("\r", "")) {}  /* likewise. */ \
00129   if (url_string.ends(name_string)) { \
00130     /* handle the name being boring (the URL ends with it). replace with the intermediate text, provided any of it survives the cleaning below. */ \
00131     MAKE_MORE_ENGLISH(intermediate_text); \
00132     strain_out_html_codes(intermediate_text); \
00133     CLEAN_UP_NAUGHTY(intermediate_text); \
00134     if (intermediate_text.length()) \
00135       name_string = intermediate_text; \
00136   } \
00137   /* output a link in the HOOPLE format.  NOTE(review): the fields are appended as-is, so a double quote inside one is not escaped; confirm the format tolerates that. */ \
00138   astring to_write = "\"L\",\""; \
00139   to_write += name_string; \
00140   to_write += "\",\""; \
00141   to_write += last_title;  /* the most recently seen section title serves as the link's category. */ \
00142   to_write += "\",\""; \
00143   to_write += url_string; \
00144   to_write += "\"\n"; \
00145   output_file.write(to_write); \
00146   _link_count++; \
00147 }
00148 
00149 // writes out the current section in the HOOPLE format, as a line of the form "C","title","Root", and bumps '_category_count'.  last_title is cleaned in place first.
00150 // currently the parent category is set to Root.
00151 #define WRITE_SECTION { \
00152   CLEAN_UP_NAUGHTY(last_title);  /* clean the name. */ \
00153   /* output a category definition. */ \
00154   astring to_write = "\"C\",\""; \
00155   to_write += last_title; \
00156   to_write += "\",\""; \
00157   to_write += "Root"; \
00158   to_write += "\"\n"; \
00159   output_file.write(to_write); \
00160   _category_count++; \
00161 }
00162 
00163 // clears our accumulator strings (url_string, name_string and intermediate_text).  last_title is not touched, so links written afterwards still carry it.
00164 #define RESET_STRINGS { \
00165   url_string = astring::empty_string(); \
00166   name_string = astring::empty_string(); \
00167   intermediate_text = astring::empty_string(); \
00168 }
00169 
00171 
00172 class link_parser : public application_shell  // command line tool that gathers the links in an html file and writes them out as a HOOPLE link database.
00173 {
00174 public:
00175   link_parser() : application_shell(), _link_count(0), _category_count(0) {}  // starts with nothing counted; the strings start out empty.
00176   DEFINE_CLASS_NAME("link_parser");
00177   virtual int execute();  // reads the -i and -o parameters, parses the input and writes the database.  returns 0 when done, or the value of print_instructions() if a parameter is missing.
00178   int print_instructions(const filename &program_name);  // logs the usage text for the program and returns 12.
00179 
00180 private:
00181   int _link_count;  // number of links written so far (bumped by WRITE_LINK).
00182   int _category_count;  // number of categories written so far (bumped by WRITE_SECTION).
00183 
00184   astring url_string;  // the URL we've parsed.
00185   astring name_string;  // the name that we've parsed for the URL.
00186   astring last_title;  // the last name that was set for a section; written as the category of each link.
00187   astring intermediate_text;  // strings we saw before a link; used as the link's name when the parsed name only repeats the end of the URL.
00188 
00189   astring heading_num;
00190     // this string form of a number tracks what kind of heading was started.  it is assigned in execute() but not read anywhere in the code shown here.
00191 };
00192 
00194 
00195 int link_parser::print_instructions(const filename &program_name)
00196 {
00197   a_sprintf to_show("%s:\n\
00198 This program needs two filenames as command line parameters.  The -i flag\n\
00199 is used to specify the input filename and the -o flag specifies the output\n\
00200 file to be created.  The input file is expected to be an html file\n\
00201 containing links to assorted web sites.  The links are gathered, along with\n\
00202 descriptive text that happens to be near them, to create a link database in\n\
00203 the HOOPLE link format and write it to the output file.  HOOPLE link format\n\
00204 is basically a CSV file that defines the columns 1-4 for describing either\n\
00205 link categories (which support hierarchies) or actual links (i.e., URLs of\n\
00206 interest).  The links are written to a CSV file in the standard HOOPLE link\n\
00207 The HOOPLE link format is documented here:\n\
00208     http://hoople.org/guides/link_database/format_manifesto.txt\n\
00209 ", program_name.basename().raw().s(), program_name.basename().raw().s());
00210   program_wide_logger::get().log(to_show, ALWAYS_PRINT);
00211   return 12;
00212 }
00213 
00214 int link_parser::execute()
00215 {
00216   FUNCDEF("main");
00217   command_line cmds(_global_argc, _global_argv);  // process the command line parameters.
00218   astring input_filename;  // we'll store our bookmarks file's name here.
00219   astring output_filename;  // where the processed marks go.
00220   if (!cmds.get_value('i', input_filename, false))
00221     return print_instructions(cmds.program_name());
00222   if (!cmds.get_value('o', output_filename, false))
00223     return print_instructions(cmds.program_name());
00224 
00225   BASE_LOG(astring("input file: ") + input_filename);
00226   BASE_LOG(astring("output file: ") + output_filename);
00227 
00228   astring full_contents;
00229   byte_filer input_file(input_filename, "r");
00230   if (!input_file.good())
00231     non_continuable_error(class_name(), func, "the input file could not be opened");
00232   input_file.read(full_contents, MAX_FILE_SIZE);
00233   input_file.close();
00234 
00235   filename outname(output_filename);
00236   if (outname.exists()) {
00237     non_continuable_error(class_name(), func, astring("the output file ")
00238         + output_filename + " already exists.  It would be over-written if "
00239         "we continued.");
00240   }
00241 
00242   byte_filer output_file(output_filename, "w");
00243   if (!output_file.good())
00244     non_continuable_error(class_name(), func, "the output file could not be opened");
00245 
00246   enum parsing_states {
00247     // the states below are order dependent; do not change the ordering!
00248     SEEKING_LINK_START,  // looking for the beginning of an html link.
00249     SEEKING_HREF,  // finding the href portion of the link.
00250     GETTING_URL,  // chowing on the URL portion of the link.
00251     SEEKING_NAME,  // finding the closing bracket of the <a ...>.
00252     GETTING_NAME,  // chowing down on characters in the link's name.
00253     SEEKING_CLOSURE,  // looking for the </a> to end the link.
00254     // there is a discontinuity after SEEKING_CLOSURE, but then the following
00255     // states are also order dependent.
00256     SAW_TITLE_START,  // the beginning of a section heading was seen.
00257     GETTING_TITLE  // grabbing characters in the title.
00258   };
00259 
00260   int curr_index = 0;
00261   parsing_states state = SEEKING_LINK_START;
00262   while (curr_index < full_contents.length()) {
00263     switch (state) {
00264       case SEEKING_LINK_START:
00265         // if we don't see a less-than, then it's not the start of html code,
00266         // so we'll ignore it for now.
00267         if (full_contents[curr_index] != '<') {
00268           ADD_INTERMEDIATE;
00269           INCREM_N_GO;
00270         }
00271         // found a left angle bracket, so now we need to make sure this is
00272         // an address style code.
00273         curr_index++;
00274         if (caseless_equals('h', full_contents[curr_index])) {
00275           // check that we're seeing a heading definition here.
00276           const char next = full_contents[curr_index + 1];
00277           if ( (next >= '0') && (next <= '9') ) {
00278             // we found our proper character for starting a heading.  we need
00279             // to jump into that state now.  we'll leave the cursor at the
00280             // beginning of the number.
00281             state = SAW_TITLE_START;
00282             INCREM_N_GO;
00283           }
00284         }
00285         if (!caseless_equals('a', full_contents[curr_index])) {
00286           intermediate_text += '<';
00287           JUMP_TO_CHAR('>', true);
00288           continue; 
00289         }
00290         // found an a, but make sure that's the only character in the word.
00291         curr_index++;
00292         if (!parser_bits::white_space(full_contents[curr_index])) {
00293           intermediate_text += "<a";
00294           JUMP_TO_CHAR('>', true);
00295           continue; 
00296         }
00297         // this looks like an address so find the start of the href.
00298         NEXT_STATE_INCREM;
00299         break;
00300       case SEEKING_HREF:
00301         JUMP_TO_CHAR('h', false);  // find the next 'h' for "href".
00302         curr_index++;
00303         if (!caseless_equals('r', full_contents[curr_index])) continue;
00304         curr_index++;
00305         if (!caseless_equals('e', full_contents[curr_index])) continue;
00306         curr_index++;
00307         if (!caseless_equals('f', full_contents[curr_index])) continue;
00308         curr_index++;
00309         if (full_contents[curr_index] != '=') continue;
00310         curr_index++;
00311         if (full_contents[curr_index] != '"') continue;
00312         // whew, got through the word href and the assignment.  the rest
00313         // should all be part of the link.
00314         NEXT_STATE_INCREM;
00315         break;
00316       case GETTING_URL:
00317         // as long as we don't see the closure of the quoted string for the
00318         // href, then we can keep accumulating characters from it.
00319         if (full_contents[curr_index] == '"') NEXT_STATE_INCREM;
00320         url_string += full_contents[curr_index];
00321         INCREM_N_GO;  // keep chewing on it in this same state.
00322         break;
00323       case SEEKING_NAME:
00324         JUMP_TO_CHAR('>', false);  // find closing bracket.
00325         NEXT_STATE_INCREM;  // now start grabbing the name characters.
00326         break;
00327       case GETTING_NAME:
00328         // we have to stop grabbing name characters when we spy a new code
00329         // being started.
00330         if (full_contents[curr_index] == '<') {
00331           // if we see a closing command, then we assume it's the one we want.
00332           if (full_contents[curr_index + 1] == '/')
00333             NEXT_STATE_INCREM;
00334           // if we see html inside the name, we just throw it out.
00335           JUMP_TO_CHAR('>', false);
00336           curr_index++;
00337           continue;
00338         }
00339         name_string += full_contents[curr_index];
00340         INCREM_N_GO;  // keep chewing on it in this same state.
00341         break;
00342       case SEEKING_CLOSURE:
00343         JUMP_TO_CHAR('>', false);  // find the closure of the html code.
00344         // write the link out now.
00345         WRITE_LINK;
00346         // clean out our accumulated strings.
00347         RESET_STRINGS;
00348         state = SEEKING_LINK_START;
00349         INCREM_N_GO;
00350         break;
00351       case SAW_TITLE_START:
00352         heading_num = full_contents.substring(curr_index, curr_index);
00353         JUMP_TO_CHAR('>', false);
00354         NEXT_STATE_INCREM;  // start eating the name.
00355         break;
00356       case GETTING_TITLE: {
00357         int indy = full_contents.find('<', curr_index);
00358         if (negative(indy)) {
00359           state = SEEKING_LINK_START;  // too weird, go back to start.
00360           continue;
00361         }
00362         last_title = full_contents.substring(curr_index, indy - 1);
00363         WRITE_SECTION;
00364         JUMP_TO_CHAR('<', false);  // now find the start of the header closure.
00365         JUMP_TO_CHAR('>', false);  // now find the end of the header closure.
00366         RESET_STRINGS;
00367         state = SEEKING_LINK_START;  // successfully found section name.
00368         break;
00369       }
00370       default:
00371         non_continuable_error(class_name(), func, "entered erroneous state!");
00372     }
00373   }
00374 
00375   if (url_string.t()) WRITE_LINK;
00376 
00377   output_file.close();
00378 
00379   BASE_LOG(a_sprintf("wrote %d links in %d categories.", _link_count,
00380       _category_count));
00381 
00382   return 0;
00383 }
00384 
00386 
00387 HOOPLE_MAIN(link_parser, )  // presumably expands to the program's entry point, which runs a link_parser object -- confirm in application/hoople_main.h.
00388 
Generated on Sat Jan 28 04:21:58 2012 for hoople2 project by  doxygen 1.6.3