bookmark_tree.cpp

Go to the documentation of this file.
00001 /*****************************************************************************\
00002 *                                                                             *
00003 *  Name   : bookmark_tree                                                     *
00004 *  Author : Chris Koeritz                                                     *
00005 *                                                                             *
00006 *******************************************************************************
00007 * Copyright (c) 2005-$now By Author.  This program is free software; you can  *
00008 * redistribute it and/or modify it under the terms of the GNU General Public  *
00009 * License as published by the Free Software Foundation; either version 2 of   *
00010 * the License or (at your option) any later version.  This is online at:      *
00011 *     http://www.fsf.org/copyleft/gpl.html                                    *
00012 * Please send any updates to: fred@gruntose.com                               *
00013 \*****************************************************************************/
00014 
00015 #include "bookmark_tree.h"
00016 
00017 #include <basis/function.h>
00018 #include <basis/guards.h>
00019 #include <basis/istring.h>
00020 #include <data_struct/amorph.cpp>
00021 #include <data_struct/string_table.cpp>
00022 #include <data_struct/symbol_table.cpp>
00023 #include <nodes/symbol_tree.h>
00024 #include <opsystem/byte_filer.h>
00025 #include <loggers/file_logger.h>
00026 #include <opsystem/filename.h>
00027 #include <textual/list_parsing.h>
00028 #include <textual/parser_bits.h>
00029 
00030 using namespace nodes;
00031 
00032 //#define DEBUG_MARKS
00033   // uncomment to have more debugging noise.
00034 
00035 #undef BASE_LOG
00036 #define BASE_LOG(s) program_wide_logger().log(s)
00037 #define SHOW_LINE \
00038    isprintf("line %d: ", _line_number)
00039 #undef LOG
00040 #define LOG(s) CLASS_EMERGENCY_LOG(program_wide_logger(), SHOW_LINE + s)
00041 #define DEADLY_LINE \
00042    (istring(func) + isprintf(", line %d: ", _line_number))
00043 
00044 const int HASHING_PARAMETER = 6;
00045   // we will have 2^n hashing slots, i.e. locations into which things will
00046   // be stored.  for example, if n is 4, then there will be 16 slots, and on
00047   // average a search for an item would take only a 16th of the number of
00048   // comparisons as it would in a simple linear search (assuming a good
00049   // hashing function is used).
00050 
00051 const int MAX_LINE_SIZE = 256 * KILOBYTE;
00052   // the largest line we'll process in the links database.
00053 
00054 // used to compare two strings while ignoring case; we use this to find
00055 // our categories in the symbol table.
00056 bool case_insense_compare(const istring &a, const istring &b)
00057 { return a.iequals(b); }
00058 
00060 
00061 listo_links::listo_links() : amorph<link_record>(), _next_index(0) {}
00062 
00063 void listo_links::add(link_record *new_rec, bool sort)
00064 {
00065   // we don't sort blank lines--they just get dropped in the end of
00066   // the section.
00067   if (sort && new_rec->_description.t()) {
00068     for (int i = _next_index; i < elements(); i++) {
00069       const istring &desc_cur = borrow(i)->_description;
00070       if ( (desc_cur.icompare(new_rec->_description) > 0) || !desc_cur) {
00071         insert(i, 1);
00072         put(i, new_rec);
00073         return;
00074       }
00075     }
00076   }
00077   append(new_rec);
00078   if (!sort)
00079     _next_index = elements();
00080 }
00081 
00083 
00084 class symbol_int : public symbol_table<int>
00085 {
00086 public:
00087   symbol_int() : symbol_table<int>(10) {}
00088 };
00089 
00091 
00092 bookmark_tree::bookmark_tree()
00093 : _line_number(0),
00094   _mark_tree(new inner_mark_tree("Root", 0, HASHING_PARAMETER)),
00095   _link_count(0),
00096   _category_count(0),
00097   _last_parent(_mark_tree),
00098   _last_node(_mark_tree),
00099   _links_seen(new symbol_int),
00100   _category_names(new string_table)
00101 {}
00102 
00103 bookmark_tree::~bookmark_tree()
00104 {
00105   WHACK(_links_seen);
00106   WHACK(_mark_tree);
00107   WHACK(_category_names);
00108 }
00109 
00110 void bookmark_tree::break_name(const istring &to_break, istring &name,
00111     istring &nick)
00112 {
00113   nick = istring::empty_string();
00114   name = to_break;
00115   int indy = name.find('[');
00116   if (negative(indy)) return;
00117   nick = name.substring(indy + 1, name.end());
00118   while ( (nick[nick.end()] == ' ') || (nick[nick.end()] == ']') )
00119     nick.zap(nick.end(), nick.end());
00120   name.zap(indy, name.end());
00121   name.strip_spaces();
00122   nick.strip_spaces();
00123 }
00124 
00125 inner_mark_tree &bookmark_tree::access_root() { return *_mark_tree; }
00126 
00127 bool bookmark_tree::magic_category_comparison(const istring &a,
00128     const istring &b)
00129 {
00130   FUNCDEF("magic_category_comparison");
00131 //LOG(istring("compare: a=") + a + " b=" + b);
00132   if (a.iequals(b)) return true;
00133   istring a_name, a_nick;
00134   break_name(a, a_name, a_nick);
00135   istring b_name, b_nick;
00136   break_name(b, b_name, b_nick);
00137   if (a_name.iequals(b_name)) return true;
00138   if (a_nick.t() && a_nick.iequals(b_name)) return true;
00139   if (b_nick.t() && a_name.iequals(b_nick)) return true;
00140   if (a_nick.t() && b_nick.t() && a_nick.iequals(b_nick)) return true;
00141   return false;
00142 }
00143 
00144 inner_mark_tree *bookmark_tree::find_parent(const istring &parent_name)
00145 {
00146   FUNCDEF("find_parent");
00147   // first, look for the node above the last parent.
00148   inner_mark_tree *parent = dynamic_cast<inner_mark_tree *>
00149       (_last_parent->find(parent_name, inner_mark_tree::recurse_upward,
00150        magic_category_comparison));
00151 
00152 #ifdef DEBUG_MARKS
00153   if (parent)
00154     LOG(istring("trying upwards find for parent node ") + parent_name);
00155 #endif
00156 
00157   if (!parent) {
00158 #ifdef DEBUG_MARKS
00159     LOG(istring("upwards find failed seeking on last_parent node ")
00160         + parent_name);
00161 #endif
00162 
00163     // we didn't find the parent above the last category...
00164     parent = dynamic_cast<inner_mark_tree *>(_last_node->find(parent_name,
00165         inner_mark_tree::recurse_upward, magic_category_comparison));
00166   }
00167 
00168   if (!parent) {
00169 #ifdef DEBUG_MARKS
00170     LOG(istring("upwards find failed seeking on last_node ") + parent_name);
00171 #endif
00172 
00173     // last node didn't help either.
00174     parent = dynamic_cast<inner_mark_tree *>(_mark_tree->find(parent_name,
00175         inner_mark_tree::recurse_downward, magic_category_comparison));
00176   }
00177   if (!parent) {
00178     // failed to find the parent node, so hook it to the root node.
00179     LOG(istring("failed to find parent node ") + parent_name);
00180 
00181     // create a parent node and use it for this guy.
00182     inner_mark_tree *new_node = new inner_mark_tree(parent_name,
00183         _line_number, HASHING_PARAMETER);
00184     _mark_tree->attach(new_node);
00185     _mark_tree->sort();
00186     _category_count++;
00187 
00188     parent = new_node;
00189   } else {
00190 #ifdef DEBUG_MARKS
00191     LOG(istring("found parent node ") + parent_name);
00192 #endif
00193   }
00194 
00195   return parent;
00196 }
00197 
00198 inner_mark_tree *bookmark_tree::process_category(const string_array &items)
00199 {
00200   FUNCDEF("process_category");
00201   const istring &category_name = items[1];
00202   const istring &parent_name = items[2];
00203 
00204   if (items.length() > 3) {
00205     // complain about a possibly malformed category.
00206     LOG(istring("category ") + category_name + " under " + parent_name
00207         + " has extra fields!");
00208   }
00209 
00210 //BASE_LOG("CURRENT:");
00211 //BASE_LOG(_mark_tree->text_form());
00212 
00213   // make sure we don't add anything to the tree if this is the root.
00214   if (!parent_name || magic_category_comparison("Root", category_name)) {
00215 #ifdef DEBUG_MARKS
00216     LOG(istring("skipping parent node for ") + category_name);
00217 #endif
00218     return _mark_tree;
00219   }
00220 
00221   // ensure that the categories aren't competing with other names.
00222   istring main_name, nickname;
00223   break_name(category_name, main_name, nickname);
00224   istring *found1 = _category_names->find(main_name, case_insense_compare);
00225   istring *found2 = _category_names->find(nickname, case_insense_compare);
00226   if (found1 || found2) {
00227     istring catnames;
00228     if (found1) catnames = *found1;  // add the first match, if it exists.
00229     if (found2) {
00230       if (!!catnames) catnames += " and ";
00231       catnames += *found2;
00232     }
00233     LOG(istring("category name \"") + category_name
00234         + "\" in conflict with existing: " + catnames);
00235     inner_mark_tree *fake_it = NIL;
00236 //hmmm: neither of these are right; they need to use a comparator that
00237 //      uses our magic comparison function.
00238 deadly_error(class_name(), func, "collision resolution code not present yet; please fix category error");
00239 return fake_it;
00240     if (found1)
00241       fake_it = (inner_mark_tree *)_mark_tree->find
00242           (*found1, symbol_tree::recurse_downward);
00243     if (fake_it) return fake_it;
00244     if (found2)
00245       fake_it = (inner_mark_tree *)_mark_tree->find
00246           (*found2, symbol_tree::recurse_downward);
00247     if (fake_it) return fake_it;
00248     LOG("  failure to find a match for either category!");
00249     return NIL;
00250   }
00251   // now that we know these names are unique, we'll add them into the list
00252   // so future categories can't reuse these.
00253   _category_names->add(main_name, main_name);
00254   if (!!nickname) _category_names->add(nickname, nickname);
00255 
00256   inner_mark_tree *parent = find_parent(parent_name);
00257   _last_parent = parent;  // set the parent for the next time.
00258 
00259   // see if the category is already present under the parent.
00260   for (int i = 0; i < parent->branches(); i++) {
00261     inner_mark_tree *curr = dynamic_cast<inner_mark_tree *>(parent->branch(i));
00262     if (!curr)
00263       non_continuable_error(class_name(), DEADLY_LINE, "missing branch in tree");
00264 #ifdef DEBUG_MARKS
00265     LOG(istring("looking at branch ") + curr->name());
00266 #endif
00267     if (magic_category_comparison(curr->name(), category_name)) {
00268       // it already exists?  argh.
00269       LOG(istring("category ") + category_name + " already exists under "
00270           + parent_name + ".");
00271       _last_node = curr;
00272       return curr;
00273     }
00274   }
00275 
00276   inner_mark_tree *new_node = new inner_mark_tree(category_name,
00277       _line_number, HASHING_PARAMETER);
00278   parent->attach(new_node);
00279   parent->sort();
00280   _last_node = new_node;
00281 
00282   _category_count++;
00283 
00284 #ifdef DEBUG_MARKS
00285   LOG(istring("attaching node ") + category_name + " to parent "
00286       + parent->name());
00287 #endif
00288   return new_node;
00289 }
00290 
00291 void bookmark_tree::process_link(const string_array &items)
00292 {
00293   FUNCDEF("process_link");
00294   istring description = items[1];
00295   istring parent_name = items[2];
00296   istring url = "UNKNOWN";
00297   if (items.length() >= 4) url = items[3];
00298 
00299   // strip any directory slashes that are provided as a suffix.  we don't need
00300   // them and they tend to confuse the issue when we look for duplicates.
00301   while (url[url.end()] == '/') {
00302     url.zap(url.end(), url.end());
00303   }
00304 
00305   // make some noise if they seem to have a badly formed link.
00306   if (items.length() < 4) {
00307     LOG(istring("link ") + description + " under " + parent_name
00308         + " has no URL!");
00309   } else if (items.length() > 4) {
00310     LOG(istring("link ") + description + " under " + parent_name
00311         + " has extra fields!");
00312   }
00313 
00314   // find the parent for this link.
00315   inner_mark_tree *parent = find_parent(parent_name);
00316   _last_parent = parent;  // set the parent for the next time.
00317 
00318   // see if the link already exists.
00319   istring url_copy = url.lower();
00320     // make a copy in lower-case so we ignore case differences.
00321   int *found = _links_seen->find(url_copy);
00322   if (found) {
00323     // this is not so great; a duplicate link has been found.
00324     LOG(isprintf("Duplicate Link: line %d already has ", *found) + url);
00325     return;
00326   } else {
00327     _links_seen->add(url_copy, _line_number);
00328   }
00329 
00330   // add the link to the parent.
00331   link_record *new_rec = new link_record(description, url, _line_number);
00332   parent->_links.add(new_rec);
00333 
00334   _link_count++;
00335 }
00336 
00337 void bookmark_tree::process_comment(const istring &current_line_in)
00338 {
00339   FUNCDEF("process_comment");
00340   istring current_line = current_line_in;
00341 
00342   // output the comment as simple text.
00343 //BASE_LOG("comment case");
00344   if (current_line.contains("http:")) {
00345     istring hold_curr = current_line;
00346     int indy = current_line.find("http:");
00347     hold_curr.zap(0, indy - 1);
00348     current_line = istring("&nbsp; &nbsp; &nbsp; &nbsp; "
00349         "<a href=\"") + hold_curr + "\">" + hold_curr + "</a>";
00350   } else if (current_line.t()) {
00351     // snap the comment character off of the front.
00352     current_line.zap(0, 0);
00353   }
00354 
00355   link_record *new_rec = new link_record(current_line,
00356       istring::empty_string(), _line_number);
00357   _last_node->_links.add(new_rec, false);
00358 }
00359 
00360 int bookmark_tree::read_csv_file(const istring &input_filename)
00361 {
00362   FUNCDEF("read_csv_file");
00363   byte_filer input_file(input_filename, "r");
00364   if (!input_file.good())
00365     non_continuable_error(class_name(), DEADLY_LINE,
00366         "the input file could not be opened");
00367 
00368   string_array items;  // parsed in csv line.
00369   istring current_line;  // read from input file.
00370 
00371   // read the lines in the file, one at a time.
00372   while (input_file.getline(current_line, MAX_LINE_SIZE) > 0) {
00373     _line_number++;  // go to the next line.
00374     // remove the carriage returns first.
00375     while (parser_bits::is_eol(current_line[current_line.end()])) {
00376       current_line.zap(current_line.end(), current_line.end());
00377     }
00378     current_line.strip_spaces();
00379     if (!current_line.length()) {
00380 //      // blank lines get treated as a special case.  they are always added
00381 //      // at the end of the list.
00382 //      process_comment(current_line);
00383       continue;
00384     } else if (current_line[0] == '#') {
00385       // handle a comment in the database.
00386       process_comment(current_line);
00387     } else {
00388       // csv parse the line, since we don't support much else.
00389       bool parsed = list_parsing::parse_csv_line(current_line, items);
00390       if (!parsed)
00391         non_continuable_error(class_name(), DEADLY_LINE,
00392             istring("the line could not be parsed: ") + current_line);
00393       if (!items.length()) {
00394         LOG("bad formatting on this line.");
00395         continue;
00396       }
00397       if (items[0].iequals("C")) {
00398         inner_mark_tree *node = process_category(items);
00399         if (!node) {
00400           LOG(istring("failed to get a node for ") + items[1]);
00401         }
00402       } else if (items[0].iequals("L")) {
00403         process_link(items);
00404       } else {
00405         non_continuable_error(class_name(), DEADLY_LINE,
00406             istring("unknown format in line: ") + current_line);
00407       }
00408     }
00409   }
00410   return 0;
00411 }
00412 

Generated on Sat Aug 30 04:31:25 2008 for HOOPLE Libraries by  doxygen 1.5.1