marks_checker.cpp

Go to the documentation of this file.
00001 /*****************************************************************************\
00002 *                                                                             *
00003 *  Name   : marks_checker                                                     *
00004 *  Author : Chris Koeritz                                                     *
00005 *                                                                             *
00006 *  Purpose:                                                                   *
00007 *                                                                             *
00008 *    Checks on the existence of the links listed in a HOOPLE format link      *
00009 *  database and reports the bad ones.                                         *
00010 *                                                                             *
00011 *******************************************************************************
00012 * Copyright (c) 2005-$now By Author.  This program is free software; you can  *
00013 * redistribute it and/or modify it under the terms of the GNU General Public  *
00014 * License as published by the Free Software Foundation; either version 2 of   *
00015 * the License or (at your option) any later version.  This is online at:      *
00016 *     http://www.fsf.org/copyleft/gpl.html                                    *
00017 * Please send any updates to: fred@gruntose.com                               *
00018 \*****************************************************************************/
00019 
00020 #include "bookmark_tree.cpp"
00021 
00022 #include <basis/auto_synch.h>
00023 #include <basis/chaos.h>
00024 #include <basis/function.h>
00025 #include <basis/guards.h>
00026 #include <basis/istring.h>
00027 #include <basis/log_base.h>
00028 #include <basis/mutex.h>
00029 #include <basis/shell_sort.h>
00030 #include <data_struct/unique_id.h>
00031 #include <mechanisms/ithread.h>
00032 #include <mechanisms/thread_cabinet.h>
00033 #include <opsystem/application_shell.h>
00034 #include <opsystem/byte_filer.h>
00035 #include <opsystem/command_line.h>
00036 #include <loggers/file_logger.h>
00037 #include <opsystem/filename.h>
00038 #include <data_struct/static_memory_gremlin.h>
00039 
00040 #include <curl/curl.h>
00041 #include <signal.h>
00042 #include <stdlib.h>
00043 
00044 using namespace nodes;
00045 
00046 //#define DEBUG_MARKS
00047   // uncomment to have more debugging noise.
00048 
00049 #undef BASE_LOG
00050 #define BASE_LOG(s) program_wide_logger().log(s)
00051 
const int PAUSEY_SNOOZE = 200;
  // the length of each nap we take while too many checker threads are
  // already running.

const int MAXIMUM_THREADS = 14;
  // the default cap on how many web checks are in flight simultaneously.

const int MAXIMUM_READ = 1008;
  // the most data we are willing to download for any one link.  keeping
  // this tiny stops us from pulling down very large sites.

const int MAXIMUM_ATTEMPTS = 2;
  // how many times a link is tried when the failure is a real error rather
  // than an http error code.  a name that the DNS cannot find sometimes
  // shows up again shortly afterwards; once this many tries have failed, we
  // give up on the address.

const int TIME_PER_REQUEST_IN_SEC = 6 * 60;
  // the longest a single request may run.  this keeps uncooperative
  // websites from stalling us forever.

const char *FAKE_AGENT_STRING = "Mozilla/5.0 "
    "(X11; U; Linux i686; en-US; rv:1.7.12) "
    "Gecko/20050921";
  // the agent type we claim to be.  some sites won't treat us fairly if
  // they think a robot is checking on their health (for example (ahem!),
  // the usa today websites).
00077 
00079 
00080 class safe_int_array
00081 {
00082 public:
00083   safe_int_array() : _lock(), _list(0) {}
00084 
00085   void add(int to_add) {
00087     auto_synchronizer l(_lock);
00088     _list += to_add;
00089   }
00090 
00091   int length() {
00092     auto_synchronizer l(_lock);
00093     return _list.length();
00094   }
00095 
00096   int_array make_copy() {
00097     auto_synchronizer l(_lock);
00098     return _list;
00099   }
00100 
00101 private:
00102   mutex _lock;
00103   int_array _list;
00104 };
00105 
00107 
00108 class marks_checker : public application_shell
00109 {
00110 public:
00111   marks_checker()
00112       : application_shell(static_class_name()), _check_redirection(false),
00113         _max_threads(MAXIMUM_THREADS), _null_file(portable::null_device(), "w")
00114   {}
00115 
00116   IMPLEMENT_CLASS_NAME("marks_checker");
00117   virtual int execute();
00118   int print_instructions(const filename &program_name);
00119 
00120   int test_all_links();
00121     // goes through the tree of links and tests them all.
00122   
00123   int check_link(const istring &url, istring &error_msg);
00124     // synchronously checks the "url" for health.  the return value is zero
00125     // on success or an HTTP error code on failure.
00126 
00127   void write_new_files();
00128     // writes out the two new files given the info accumulated so far.
00129 
00130 private:
00131   bookmark_tree _categories;  // our tree of categories.
00132   safe_int_array _bad_lines;  // lines with bad contents.
00133   thread_cabinet _checkers;  // threads checking on links.
00134   istring _input_filename;  // we'll store our link database name here.
00135   istring _output_filename;  // where the list of good links is stored.
00136   istring _bad_link_filename;  // garbage dump of bad links.
00137   bool _check_redirection;  // true if redirection is disallowed.
00138   int _max_threads;  // the most threads we'll allow at once.
00139   byte_filer _null_file;  // we'll use this for trashing output data.
00140 
00141   static void handle_OS_signal(int sig_id);
00142     // handles break signals from the user.
00143 };
00144 
00146 
00147 class checking_thread : public ithread
00148 {
00149 public:
00150   checking_thread(const link_record &link_info, safe_int_array &bad_lines,
00151       marks_checker &checker)
00152   : ithread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}
00153 
00154   void perform_activity(void *formal(ptr)) {
00155     istring message;
00156     int ret = _checker.check_link(_info._url, message);
00157     if (ret != 0) {
00158       istring complaint = isprintf("Bad Link at line %d:", _info._uid)
00159           += log_base::platform_ending();
00160       const istring spacer(' ', 4);
00161       complaint += spacer + _info._url += log_base::platform_ending();
00162       complaint += spacer + _info._description += log_base::platform_ending();
00163       complaint += spacer + "error: " += message;
00164       BASE_LOG(complaint);
00165 if ( (_info._uid> 100000) || (_info._uid < 0) ) {
00166 BASE_LOG(isprintf("somehow got bogus line number!  %d", _info._uid));
00167 return;
00168 }
00169       _bad_lines.add(_info._uid);  // list ours as bad.
00170     }
00171   }
00172 
00173 private:
00174   safe_int_array &_bad_lines;
00175   marks_checker &_checker;
00176   link_record _info;
00177 };
00178 
00180 
00181 int marks_checker::print_instructions(const filename &program_name)
00182 {
00183   isprintf to_show("%s:\n\
00184 This program needs three filenames as command line parameters.  The -i flag\n\
00185 is used to specify the input filename. The -o flag specifies the file where\n\
00186 where the good links will be written.  The -b flag specifies the file where\n\
00187 the bad links are written.  The optional flag --no-redirs can be used to\n\
00188 disallow web-site redirection, which will catch when the site has changed\n\
00189 its location.  Note that redirection is not necessarily an error, but it\n\
00190 instead may just be a link that needs its URL modified.  It is recommended\n\
00191 that you omit this flag in early runs, in order to only locate definitely\n\
00192 dead links.  Then later checking runs can find any sites that were redirected\n\
00193 or being routed to a dead link page which doesn't provide an error code.\n\
00194 The optional flag --threads with a parameter will set the maximum number of\n\
00195 threads that will simultaneously check on links.\n\
00196 The input file is expected to be in the HOOPLE link database format.\n\
00197 The HOOPLE link format is documented here:\n\
00198     http://hoople.org/guides/link_database/format_manifesto.txt\n\
00199 ", program_name.basename().raw().s(), program_name.basename().raw().s());
00200   program_wide_logger().log(to_show.s());
00201   return 12;
00202 }
00203 
00204 // this function just eats any data it's handed.
00205 size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
00206 { return size * number; }
00207 
00208 int marks_checker::check_link(const istring &url, istring &error_msg)
00209 {
00210   int to_return = -1;
00211 
00212   CURL *cur = curl_easy_init();
00213 
00214   curl_easy_setopt(cur, CURLOPT_URL, url.s());  // set the URL itself.
00215 
00216   curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
00217     // don't verify SSL certificates.
00218   curl_easy_setopt(cur, CURLOPT_MAXFILESIZE, MAXIMUM_READ);
00219     // limit the download size; causes size errors, which we elide to success.
00220   curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
00221     // don't use signals since it interferes with sleep.
00222   curl_easy_setopt(cur, CURLOPT_TIMEOUT, TIME_PER_REQUEST_IN_SEC);
00223     // limit time allowed per operation.
00224   curl_easy_setopt(cur, CURLOPT_AUTOREFERER, true);
00225     // automatically fill in the referer field when redirected.
00226 
00227   curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
00228     // set the file handle where we want our downloaded data to go.  since
00229     // we're just checking the links, this goes right to the trash.
00230   curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION, data_sink);
00231     // set the function which will be given all the downloaded data.
00232 
00233   curl_easy_setopt(cur, CURLOPT_USERAGENT, FAKE_AGENT_STRING);
00234     // fake being a browser here since otherwise we get no respect.
00235 
00236   curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
00237     // get only a simple list of files, which allows us to hit ftp sites
00238     // properly.  if the normal curl mode is used, we get nothing.
00239 
00240   if (_check_redirection) {
00241     // attempting to quash redirects as being valid.
00242     curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1);  // follow redirects.
00243     curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0);  // allow zero redirects.
00244   }
00245 
00246   int tries = 0;
00247   while (tries++ < MAXIMUM_ATTEMPTS) {
00248 
00249     // we do the error message again every time, since it gets shrunk after
00250     // the web page check and is no longer available where it was in memory.
00251     error_msg = istring(' ', CURL_ERROR_SIZE + 5);
00252     curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.s());
00253 
00254     // set the error message buffer so we know what happened.
00255 
00256     // try to lookup the web page we've been given.
00257     to_return = curl_easy_perform(cur);
00258 
00259     error_msg.shrink();  // just use the message without extra spaces.
00260 
00261     // we turn file size errors into non-errors, since we have set a very
00262     // low file size in order to avoid downloading too much.  we really just
00263     // want to check links, not download their contents.
00264     if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;
00265 
00266     if (!to_return) {
00267       // supposedly this is a success, but let's check the result code.
00268       long result;
00269       curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);
00270       if (result >= 400) {
00271         error_msg = isprintf("received http failure code %d", result);
00272         to_return = -1;
00273       }
00274       break;  // this was a successful result, a zero outcome from perform.
00275     }
00276 
00277     portable::sleep_ms(10 * SECOND_ms);  // give it a few more seconds...
00278   }
00279 
00280   curl_easy_cleanup(cur);
00281 
00282   return to_return;
00283 }
00284 
00285 int marks_checker::test_all_links()
00286 {
00287   FUNCDEF("test_all_links");
00288   // traverse the tree in prefix order.
00289   tree::iterator itty = _categories.access_root().start(tree::prefix);
00290   tree *curr = NIL;
00291   while ( (curr = itty.next()) ) {
00292     inner_mark_tree *nod = dynamic_cast<inner_mark_tree *>(curr);
00293     if (!nod)
00294       non_continuable_error(static_class_name(), func, "failed to cast a tree node");
00295     // iterate on all the links at this node to check them.
00296     for (int i = 0; i < nod->_links.elements(); i++) {
00297       link_record *lin = nod->_links.borrow(i);
00298       if (!lin->_url) continue;  // not a link.
00299 
00300       while (_checkers.threads() > _max_threads) {
00301         portable::sleep_ms(PAUSEY_SNOOZE);
00302         _checkers.clean_debris();
00303       }
00304       
00305       checking_thread *new_thread = new checking_thread(*lin, _bad_lines,
00306           *this);
00307       unique_int id = _checkers.add_thread(new_thread, true, NIL);
00308     }
00309   }
00310 
00311 BASE_LOG("... finished iterating on tree.");
00312 
00313   // now wait until all the threads are finished.  
00314   while (_checkers.threads()) {
00315     portable::sleep_ms(PAUSEY_SNOOZE);
00316     _checkers.clean_debris();
00317   }
00318   
00319 BASE_LOG("... finished waiting for all threads.");
00320 
00321   return 0;
00322 }
00323 
00324 void marks_checker::write_new_files()
00325 {
00326   byte_filer input_file(_input_filename, "r");
00327   byte_filer output_file(_output_filename, "w");
00328   byte_filer badness_file(_bad_link_filename, "w");
00329 
00330   int_array badness = _bad_lines.make_copy();
00331   shell_sort<int>(badness.access(), badness.length());
00332 
00333   BASE_LOG("bad links are on lines:");
00334   istring bad_list;
00335     for (int i = 0; i < badness.length(); i++) {
00336     bad_list += isprintf("%d, ", badness[i]);
00337   }
00338   BASE_LOG(bad_list);
00339 
00340   istring buffer;
00341   int curr_line = 0;
00342   while (!input_file.eof()) {
00343     curr_line++;
00344     while (badness.length() && (badness[0] < curr_line) ) {
00345       BASE_LOG(isprintf("whacking too low line number: %d", badness[0]));
00346       badness.zap(0, 0);
00347     }
00348     input_file.getline(buffer, 2048);
00349 //make that a constant.
00350     if (badness.length() && (badness[0] == curr_line)) {
00351       // we seem to have found a bad line.
00352       badness_file.write(buffer);
00353       badness.zap(0, 0);  // remove the current line number.
00354     } else {
00355       // this is a healthy line.
00356       output_file.write(buffer);
00357     }
00358     
00359   }
00360   input_file.close();
00361   output_file.close();
00362   badness_file.close();
00363 }
00364 
00365 marks_checker *main_program = NIL;
00366 
00367 void marks_checker::handle_OS_signal(int formal(sig_id))
00368 {
00369   signal(SIGINT, SIG_IGN);  // turn off that signal for now.
00370   BASE_LOG("caught break signal...  now writing files.");
00371   if (main_program) main_program->write_new_files();
00372   BASE_LOG("exiting after handling break.");
00373   main_program = NIL;
00374   exit(0);
00375 }
00376 
00377 int marks_checker::execute()
00378 {
00379   FUNCDEF("main");
00380   SET_DEFAULT_COMBO_LOGGER;
00381 
00382   main_program = this;  // used by our signal handler.
00383 
00384   command_line cmds(__argc, __argv);  // process the command line parameters.
00385   if (!cmds.get_value('i', _input_filename, false))
00386     return print_instructions(cmds.program_name());
00387   if (!cmds.get_value('o', _output_filename, false))
00388     return print_instructions(cmds.program_name());
00389   if (!cmds.get_value('b', _bad_link_filename, false))
00390     return print_instructions(cmds.program_name());
00391 
00392   istring temp;
00393 
00394   // optional flag for checking website redirection.
00395   if (cmds.get_value("no-redirs", temp, false)) {
00396     BASE_LOG("Enabling redirection checking: redirected web sites are reported as bad.");
00397     _check_redirection = true;
00398   }
00399   // optional flag for number of threads.
00400   istring threads;
00401   if (cmds.get_value("threads", threads, false)) {
00402     _max_threads = threads.convert(0);
00403     BASE_LOG(isprintf("Maximum threads allowed=%d", _max_threads));
00404   }
00405 
00406   BASE_LOG(istring("input file: ") + _input_filename);
00407   BASE_LOG(istring("output file: ") + _output_filename);
00408   BASE_LOG(istring("bad link file: ") + _bad_link_filename);
00409 
00410 //hmmm: check if output file already exists.
00411 //hmmm: check if bad file already exists.
00412 
00413 BASE_LOG("before reading input...");
00414 
00415   int ret = _categories.read_csv_file(_input_filename);
00416   if (ret) return ret;  // failure during read means we can't do much.
00417 
00418 BASE_LOG("after reading input...");
00419 
00420   signal(SIGINT, handle_OS_signal);
00421     // hook the break signal so we can still do part of the job if they
00422     // interrupt us.
00423 
00424   curl_global_init(CURL_GLOBAL_ALL);  // crank up the cURL engine.
00425 
00426   ret = test_all_links();
00427   
00428   write_new_files();
00429   main_program = NIL;
00430 
00431   curl_global_cleanup();  // shut down cURL engine again.
00432 
00433   return 0;
00434 }
00435 
00437 
00438 HOOPLE_MAIN(marks_checker, )
00439 
00440 #ifdef __BUILD_STATIC_APPLICATION__
00441   // static dependencies found by buildor_gen_deps.sh:
00442   #include <basis/array.cpp>
00443   #include <basis/byte_array.cpp>
00444   #include <basis/callstack_tracker.cpp>
00445   #include <basis/chaos.cpp>
00446   #include <basis/convert_utf.cpp>
00447   #include <basis/definitions.cpp>
00448   #include <basis/earth_time.cpp>
00449   #include <basis/guards.cpp>
00450   #include <basis/istring.cpp>
00451   #include <basis/log_base.cpp>
00452   #include <basis/memory_checker.cpp>
00453   #include <basis/mutex.cpp>
00454   #include <basis/object_base.cpp>
00455   #include <basis/outcome.cpp>
00456   #include <basis/packable.cpp>
00457   #include <basis/portable.cpp>
00458   #include <basis/sequence.cpp>
00459   #include <basis/set.cpp>
00460   #include <basis/utility.cpp>
00461   #include <basis/version_record.cpp>
00462   #include <data_struct/amorph.cpp>
00463   #include <data_struct/bit_vector.cpp>
00464   #include <data_struct/byte_hasher.cpp>
00465   #include <data_struct/configurator.cpp>
00466   #include <data_struct/hash_table.cpp>
00467   #include <data_struct/pointer_hash.cpp>
00468   #include <data_struct/stack.cpp>
00469   #include <data_struct/static_memory_gremlin.cpp>
00470   #include <data_struct/string_hash.cpp>
00471   #include <data_struct/string_hasher.cpp>
00472   #include <data_struct/string_table.cpp>
00473   #include <data_struct/symbol_table.cpp>
00474   #include <data_struct/table_configurator.cpp>
00475   #include <loggers/console_logger.cpp>
00476   #include <loggers/file_logger.cpp>
00477   #include <loggers/locked_logger.cpp>
00478   #include <loggers/null_logger.cpp>
00479   #include <loggers/program_wide_logger.cpp>
00480   #include <mechanisms/ithread.cpp>
00481   #include <mechanisms/roller.cpp>
00482   #include <mechanisms/thread_cabinet.cpp>
00483   #include <mechanisms/time_stamp.cpp>
00484   #include <nodes/node.cpp>
00485   #include <nodes/path.cpp>
00486   #include <nodes/symbol_tree.cpp>
00487   #include <nodes/tree.cpp>
00488   #include <opsystem/application_base.cpp>
00489   #include <opsystem/application_shell.cpp>
00490   #include <opsystem/byte_filer.cpp>
00491   #include <opsystem/command_line.cpp>
00492   #include <opsystem/critical_events.cpp>
00493   #include <opsystem/directory.cpp>
00494   #include <opsystem/filename.cpp>
00495   #include <opsystem/ini_config.cpp>
00496   #include <opsystem/ini_parser.cpp>
00497   #include <opsystem/path_configuration.cpp>
00498   #include <opsystem/rendezvous.cpp>
00499   #include <textual/byte_format.cpp>
00500   #include <textual/list_parsing.cpp>
00501   #include <textual/parser_bits.cpp>
00502   #include <textual/string_manipulation.cpp>
00503   #include <textual/tokenizer.cpp>
00504 #endif // __BUILD_STATIC_APPLICATION__
00505 

Generated on Fri Nov 28 04:28:48 2008 for HOOPLE Libraries by  doxygen 1.5.1