bookmark_tree.cpp

Go to the documentation of this file.
00001 /*****************************************************************************\
00002 *                                                                             *
00003 *  Name   : bookmark_tree                                                     *
00004 *  Author : Chris Koeritz                                                     *
00005 *                                                                             *
00006 *******************************************************************************
00007 * Copyright (c) 2005-$now By Author.  This program is free software; you can  *
00008 * redistribute it and/or modify it under the terms of the GNU General Public  *
00009 * License as published by the Free Software Foundation; either version 2 of   *
00010 * the License or (at your option) any later version.  This is online at:      *
00011 *     http://www.fsf.org/copyleft/gpl.html                                    *
00012 * Please send any updates to: fred@gruntose.com                               *
00013 \*****************************************************************************/
00014 
00015 #include "bookmark_tree.h"
00016 
00017 #include <basis/astring.h>
00018 #include <basis/functions.h>
00019 #include <basis/guards.h>
00020 #include <filesystem/byte_filer.h>
00021 #include <filesystem/filename.h>
00022 #include <loggers/critical_events.h>
00023 #include <loggers/file_logger.h>
00024 #include <loggers/program_wide_logger.h>
00025 #include <nodes/symbol_tree.h>
00026 #include <structures/amorph.h>
00027 #include <structures/string_table.cpp>
00028 #include <structures/symbol_table.h>
00029 #include <textual/list_parsing.h>
00030 #include <textual/parser_bits.h>
00031 
00033 
00034 using namespace basis;
00035 using namespace filesystem;
00036 using namespace loggers;
00037 using namespace nodes;
00038 using namespace structures;
00039 using namespace textual;
00040 
00041 #define DEBUG_MARKS
00042   // uncomment to have more debugging noise, but a reasonable amount.
00043 //#define DEBUG_MARKS_TREE
00044   // uncomment to get crazy noisy debug noise about tree traversal.
00045 
00046 #undef BASE_LOG
00047 #define BASE_LOG(s) program_wide_logger::get().log(s)
00048 #define SHOW_LINE a_sprintf("line %d: ", _line_number)
00049 #undef LOG
00050 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger::get(), SHOW_LINE + s)
00051 #define DEADLY_LINE (astring(func) + a_sprintf(", line %d: ", _line_number))
00052 
00053 const int ESTIMATED_ELEMENTS = 100;
00054   // we're planning for about this many links to be efficiently handled.
00055 
00056 const int MAX_LINE_SIZE = 256 * KILOBYTE;
00057   // the largest line we'll process in the links database.
00058 
00059 // used to compare two strings while ignoring case; we use this to find
00060 // our categories in the symbol table.
00061 bool case_insense_compare(const astring &a, const astring &b)
00062 { return a.iequals(b); }
00063 
00065 
00066 listo_links::listo_links() : amorph<link_record>(), _next_index(0) {}
00067 
00068 void listo_links::add(link_record *new_rec, bool sort)
00069 {
00070   // we don't sort blank lines--they just get dropped in the end of
00071   // the section.
00072   if (sort && new_rec->_description.t()) {
00073     for (int i = _next_index; i < elements(); i++) {
00074       const astring &desc_cur = borrow(i)->_description;
00075 //this check doesn't make much sense; it only checks if the description is equal?
00076 // if it were really sorting, wouldn't it need to check if the check is greater than current?
00077       if (desc_cur.iequals(new_rec->_description)
00078 //          || shouldn't there be a case for this being greater than the current???
00079           || !desc_cur) {
00080         insert(i + 1, 1);
00081         put(i + 1, new_rec);
00082         return;
00083       }
00084     }
00085   }
00086   append(new_rec);
00087   if (!sort)
00088     _next_index = elements();
00089 }
00090 
00092 
00093 class symbol_int : public symbol_table<int>
00094 {
00095 public:
00096   symbol_int() : symbol_table<int>(10) {}
00097 };
00098 
00100 
00101 bookmark_tree::bookmark_tree()
00102 : _line_number(0),
00103   _mark_tree(new inner_mark_tree("Root", 0, ESTIMATED_ELEMENTS)),
00104   _link_count(0),
00105   _category_count(0),
00106   _last_parent(_mark_tree),
00107   _last_node(_mark_tree),
00108   _links_seen(new symbol_int),
00109   _category_names(new string_table)
00110 {}
00111 
00112 bookmark_tree::~bookmark_tree()
00113 {
00114   WHACK(_links_seen);
00115   WHACK(_mark_tree);
00116   WHACK(_category_names);
00117 }
00118 
00119 void bookmark_tree::break_name(const astring &to_break, astring &name,
00120     astring &nick)
00121 {
00122   nick = astring::empty_string();
00123   name = to_break;
00124   int indy = name.find('[');
00125   if (negative(indy)) return;
00126   nick = name.substring(indy + 1, name.end());
00127   while ( (nick[nick.end()] == ' ') || (nick[nick.end()] == ']') )
00128     nick.zap(nick.end(), nick.end());
00129   name.zap(indy, name.end());
00130   name.strip_spaces();
00131   nick.strip_spaces();
00132 }
00133 
00134 inner_mark_tree &bookmark_tree::access_root() { return *_mark_tree; }
00135 
00136 bool bookmark_tree::magic_category_comparison(const astring &a, const astring &b)
00137 {
00138 //  FUNCDEF("magic_category_comparison");
00139 //LOG(astring("compare: a=") + a + " b=" + b);
00140   if (a.iequals(b)) return true;
00141   astring a_name, a_nick;
00142   break_name(a, a_name, a_nick);
00143   astring b_name, b_nick;
00144   break_name(b, b_name, b_nick);
00145   if (a_name.iequals(b_name)) return true;
00146   if (a_nick.t() && a_nick.iequals(b_name)) return true;
00147   if (b_nick.t() && a_name.iequals(b_nick)) return true;
00148   if (a_nick.t() && b_nick.t() && a_nick.iequals(b_nick)) return true;
00149   return false;
00150 }
00151 
00152 const astring &HTTP_HEAD = "http://";
00153 const astring &FTP_HEAD = "ftp://";
00154 const astring &WWW_SITE = "www.";
00155 const astring &FTP_SITE = "ftp.";
00156 
00157 bool bookmark_tree::advance(int &index, const astring &check, const astring &finding)
00158 {
00159   if (check.compare(finding, index, 0, finding.length(), false)) {
00160     index += finding.length();
00161     return true;
00162   }
00163   return false;
00164 }
00165 
00166 int bookmark_tree::find_prune_point(const astring &to_prune)
00167 {
00168   int to_return = 0;
00169   advance(to_return, to_prune, HTTP_HEAD);
00170   advance(to_return, to_prune, FTP_HEAD);
00171   advance(to_return, to_prune, WWW_SITE);
00172   advance(to_return, to_prune, FTP_SITE);
00173   return to_return;
00174 }
00175 
00176 astring bookmark_tree::prune_link_down(const astring &to_prune)
00177 {
00178 //printf("%s\n", (astring("pruned=") + to_prune.substring(find_prune_point(to_prune), to_prune.end())).s());
00179  return to_prune.substring(find_prune_point(to_prune), to_prune.end()); }
00180 
00181 bool bookmark_tree::excellent_link_comparator(const astring &a, const astring &b)
00182 {
00183   int prune_a = find_prune_point(a);
00184   int prune_b = find_prune_point(b);
00185   int bigger_len = maximum(a.length() - prune_a, b.length() - prune_b);
00186   bool to_return = a.compare(b, prune_a, prune_b, bigger_len, false);
00187 //if (to_return) printf("%s and %s are equal.", a.s(), b.s());
00188   return to_return;
00189 }
00190 
00191 inner_mark_tree *bookmark_tree::find_parent(const astring &parent_name)
00192 {
00193   FUNCDEF("find_parent");
00194   // first, look for the node above the last parent.
00195   inner_mark_tree *parent = dynamic_cast<inner_mark_tree *>
00196       (_last_parent->find(parent_name, inner_mark_tree::recurse_upward,
00197        magic_category_comparison));
00198 
00199 #ifdef DEBUG_MARKS_TREE
00200   if (parent)
00201     LOG(astring("trying upwards find for parent node ") + parent_name);
00202 #endif
00203 
00204   if (!parent) {
00205 #ifdef DEBUG_MARKS_TREE
00206     LOG(astring("upwards find failed seeking on last_parent node ")
00207         + parent_name);
00208 #endif
00209 
00210     // we didn't find the parent above the last category...
00211     parent = dynamic_cast<inner_mark_tree *>(_last_node->find(parent_name,
00212         inner_mark_tree::recurse_upward, magic_category_comparison));
00213   }
00214 
00215   if (!parent) {
00216 #ifdef DEBUG_MARKS_TREE
00217     LOG(astring("upwards find failed seeking on last_node ") + parent_name);
00218 #endif
00219 
00220     // last node didn't help either.
00221     parent = dynamic_cast<inner_mark_tree *>(_mark_tree->find(parent_name,
00222         inner_mark_tree::recurse_downward, magic_category_comparison));
00223   }
00224   if (!parent) {
00225     // failed to find the parent node, so hook it to the root node.
00226     LOG(astring("failed to find parent node ") + parent_name);
00227 
00228     // create a parent node and use it for this guy.
00229     inner_mark_tree *new_node = new inner_mark_tree(parent_name,
00230         _line_number, ESTIMATED_ELEMENTS);
00231     _mark_tree->attach(new_node);
00232     _mark_tree->sort();
00233     _category_count++;
00234 
00235     parent = new_node;
00236   } else {
00237 #ifdef DEBUG_MARKS_TREE
00238     LOG(astring("found parent node ") + parent_name);
00239 #endif
00240   }
00241 
00242   return parent;
00243 }
00244 
00245 inner_mark_tree *bookmark_tree::process_category(const string_array &items)
00246 {
00247   FUNCDEF("process_category");
00248   const astring &category_name = items[1];
00249   const astring &parent_name = items[2];
00250 
00251   if (items.length() > 3) {
00252     // complain about a possibly malformed category.
00253     LOG(astring("category ") + category_name + " under " + parent_name
00254         + " has extra fields!");
00255   }
00256 
00257 //BASE_LOG("CURRENT:");
00258 //BASE_LOG(_mark_tree->text_form());
00259 
00260   // make sure we don't add anything to the tree if this is the root.
00261   if (!parent_name || magic_category_comparison("Root", category_name)) {
00262 #ifdef DEBUG_MARKS_TREE
00263     LOG(astring("skipping parent node for ") + category_name);
00264 #endif
00265     return _mark_tree;
00266   }
00267 
00268   // ensure that the categories aren't competing with other names.
00269   astring main_name, nickname;
00270   break_name(category_name, main_name, nickname);
00271   astring *found1 = _category_names->find(main_name, case_insense_compare);
00272   astring *found2 = _category_names->find(nickname, case_insense_compare);
00273   if (found1 || found2) {
00274     astring catnames;
00275     if (found1) catnames = *found1;  // add the first match, if it exists.
00276     if (found2) {
00277       if (!!catnames) catnames += " and ";
00278       catnames += *found2;
00279     }
00280     LOG(astring("category name \"") + category_name
00281         + "\" in conflict with existing: " + catnames);
00282     inner_mark_tree *fake_it = NIL;
00283 
00284 //hmmm: neither of these are right; they need to use a comparator that
00285 //      uses our magic comparison function.
00286 
00287     if (found1) {
00288 #ifdef DEBUG_MARKS
00289       LOG(astring("found existing category for main name: ") + main_name);
00290 #endif
00291 //      fake_it = (inner_mark_tree *)_mark_tree->find(*found1,
00292 //          symbol_tree::recurse_downward);
00293       fake_it = dynamic_cast<inner_mark_tree *>(_mark_tree->find
00294           (*found1, inner_mark_tree::recurse_downward,
00295            magic_category_comparison));
00296     }
00297     if (fake_it) {
00298 #ifdef DEBUG_MARKS
00299       LOG(astring("returning existing category for main name: ") + main_name
00300           + " as: " + fake_it->name());
00301 #endif
00302       return fake_it;
00303     }
00304     if (found2) {
00305 #ifdef DEBUG_MARKS
00306       LOG(astring("found existing category for nickname: ") + nickname);
00307 #endif
00308 
00309 
00310       fake_it = dynamic_cast<inner_mark_tree *>(_mark_tree->find
00311           (*found2, inner_mark_tree::recurse_downward,
00312            magic_category_comparison));
00313     }
00314     if (fake_it) {
00315 #ifdef DEBUG_MARKS
00316       LOG(astring("returning existing category for nickname: ") + nickname
00317           + " as: " + fake_it->name());
00318 #endif
00319       return fake_it;
00320     }
00321     LOG("==> failure to find a match for either category!");
00322     deadly_error(class_name(), func, "collision resolution code failed; "
00323         "please fix category error");
00324     return NIL;
00325   }
00326   // now that we know these names are unique, we'll add them into the list
00327   // so future categories can't reuse these.
00328   _category_names->add(main_name, main_name);
00329   if (!!nickname) _category_names->add(nickname, nickname);
00330 
00331   inner_mark_tree *parent = find_parent(parent_name);
00332   _last_parent = parent;  // set the parent for the next time.
00333 
00334   // see if the category is already present under the parent.
00335   for (int i = 0; i < parent->branches(); i++) {
00336     inner_mark_tree *curr = dynamic_cast<inner_mark_tree *>(parent->branch(i));
00337     if (!curr)
00338       non_continuable_error(class_name(), DEADLY_LINE, "missing branch in tree");
00339 #ifdef DEBUG_MARKS_TREE
00340     LOG(astring("looking at branch ") + curr->name());
00341 #endif
00342     if (magic_category_comparison(curr->name(), category_name)) {
00343       // it already exists?  argh.
00344       LOG(astring("category ") + category_name + " already exists under "
00345           + parent_name + ".");
00346       _last_node = curr;
00347       return curr;
00348     }
00349   }
00350 
00351   inner_mark_tree *new_node = new inner_mark_tree(category_name,
00352       _line_number, ESTIMATED_ELEMENTS);
00353   parent->attach(new_node);
00354   parent->sort();
00355   _last_node = new_node;
00356 
00357   _category_count++;
00358 
00359 #ifdef DEBUG_MARKS_TREE
00360   LOG(astring("attaching node ") + category_name + " to parent "
00361       + parent->name());
00362 #endif
00363   return new_node;
00364 }
00365 
00366 void bookmark_tree::process_link(const string_array &items)
00367 {
00368   FUNCDEF("process_link");
00369   astring description = items[1];
00370   astring parent_name = items[2];
00371   astring url = "UNKNOWN";
00372   if (items.length() >= 4) url = items[3];
00373 
00374   // strip any directory slashes that are provided as a suffix.  we don't need
00375   // them and they tend to confuse the issue when we look for duplicates.
00376   while (url[url.end()] == '/') {
00377     url.zap(url.end(), url.end());
00378   }
00379 
00380   // make some noise if they seem to have a badly formed link.
00381   if (items.length() < 4) {
00382     LOG(astring("link ") + description + " under " + parent_name
00383         + " has no URL!");
00384   } else if (items.length() > 4) {
00385     LOG(astring("link ") + description + " under " + parent_name
00386         + " has extra fields!");
00387   }
00388 
00389   // find the parent for this link.
00390   inner_mark_tree *parent = find_parent(parent_name);
00391   _last_parent = parent;  // set the parent for the next time.
00392 
00393   // see if the link already exists.
00394   int *found = _links_seen->find(url, excellent_link_comparator);
00395   if (found) {
00396     // this is not so great; a duplicate link has been found.
00397     LOG(a_sprintf("Duplicate Link: line %d already has ", *found) + url);
00398     return;
00399   } else {
00400     _links_seen->add(prune_link_down(url), _line_number);
00401   }
00402 
00403   // add the link to the parent.
00404   link_record *new_rec = new link_record(description, url, _line_number);
00405   parent->_links.add(new_rec);
00406 
00407   _link_count++;
00408 }
00409 
00410 void bookmark_tree::process_comment(const astring &current_line_in)
00411 {
00413   astring current_line = current_line_in;
00414 
00415   // output the comment as simple text.
00416 //BASE_LOG("comment case");
00417   if (current_line.contains("http:")) {
00418     astring hold_curr = current_line;
00419     int indy = current_line.find("http:");
00420     hold_curr.zap(0, indy - 1);
00421     current_line = astring("&nbsp; &nbsp; &nbsp; &nbsp; "
00422         "<a href=\"") + hold_curr + "\">" + hold_curr + "</a>";
00423   } else if (current_line.t()) {
00424     // snap the comment character off of the front.
00425     current_line.zap(0, 0);
00426   }
00427 
00428   link_record *new_rec = new link_record(current_line,
00429       astring::empty_string(), _line_number);
00430   _last_node->_links.add(new_rec, false);
00431 }
00432 
00433 int bookmark_tree::read_csv_file(const astring &input_filename)
00434 {
00435   FUNCDEF("read_csv_file");
00436   byte_filer input_file(input_filename, "r");
00437   if (!input_file.good())
00438     non_continuable_error(class_name(), DEADLY_LINE,
00439         "the input file could not be opened");
00440 
00441   string_array items;  // parsed in csv line.
00442   astring current_line;  // read from input file.
00443 
00444   // read the lines in the file, one at a time.
00445   while (input_file.getline(current_line, MAX_LINE_SIZE) > 0) {
00446     _line_number++;  // go to the next line.
00447     // remove the carriage returns first.
00448     while (parser_bits::is_eol(current_line[current_line.end()])) {
00449       current_line.zap(current_line.end(), current_line.end());
00450     }
00451     current_line.strip_spaces();
00452     if (!current_line.length()) {
00453 //      // blank lines get treated as a special case.  they are always added
00454 //      // at the end of the list.
00455 //      process_comment(current_line);
00456       continue;
00457     } else if (current_line[0] == '#') {
00458       // handle a comment in the database.
00459       process_comment(current_line);
00460     } else {
00461       // csv parse the line, since we don't support much else.
00462       bool parsed = list_parsing::parse_csv_line(current_line, items);
00463       if (!parsed)
00464         non_continuable_error(class_name(), DEADLY_LINE,
00465             astring("the line could not be parsed: ") + current_line);
00466       if (!items.length()) {
00467         LOG("bad formatting on this line.");
00468         continue;
00469       }
00470       if (items[0].iequals("C")) {
00471         inner_mark_tree *node = process_category(items);
00472         if (!node) {
00473           LOG(astring("failed to get a node for ") + items[1]);
00474         }
00475       } else if (items[0].iequals("L")) {
00476         process_link(items);
00477       } else {
00478         non_continuable_error(class_name(), DEADLY_LINE,
00479             astring("unknown format in line: ") + current_line);
00480       }
00481     }
00482   }
00483   return 0;
00484 }
00485 
Generated on Sat Jan 28 04:21:58 2012 for hoople2 project by  doxygen 1.6.3