00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020 #include "bookmark_tree.cpp"
00021
00022 #include <basis/auto_synch.h>
00023 #include <basis/chaos.h>
00024 #include <basis/function.h>
00025 #include <basis/guards.h>
00026 #include <basis/istring.h>
00027 #include <basis/log_base.h>
00028 #include <basis/mutex.h>
00029 #include <basis/shell_sort.h>
00030 #include <data_struct/unique_id.h>
00031 #include <mechanisms/ithread.h>
00032 #include <mechanisms/thread_cabinet.h>
00033 #include <opsystem/application_shell.h>
00034 #include <opsystem/byte_filer.h>
00035 #include <opsystem/command_line.h>
00036 #include <loggers/file_logger.h>
00037 #include <opsystem/filename.h>
00038 #include <data_struct/static_memory_gremlin.h>
00039
00040 #include <curl/curl.h>
00041 #include <signal.h>
00042 #include <stdlib.h>
00043
00044 using namespace nodes;
00045
00046
00047
00048
// Route this file's log output through the program-wide logger, overriding
// any BASE_LOG definition picked up from the headers included above.
#undef BASE_LOG
#define BASE_LOG(s) program_wide_logger().log(s)

// Value passed to portable::sleep_ms() between polls of the thread cabinet
// while waiting for checker threads to finish.
const int PAUSEY_SNOOZE = 200;

// Default for marks_checker::_max_threads, the cap on concurrently running
// checker threads; it can be overridden with the --threads flag.
const int MAXIMUM_THREADS = 14;

// Handed to libcurl as CURLOPT_MAXFILESIZE.  check_link() treats the
// resulting CURLE_FILESIZE_EXCEEDED as success, so only enough of a page is
// fetched to prove that the link responds.
const int MAXIMUM_READ = 1008;

// Number of times check_link() will try a URL before reporting it as bad.
const int MAXIMUM_ATTEMPTS = 2;

// Handed to libcurl as CURLOPT_TIMEOUT: the longest a single transfer may
// take, in seconds.
const int TIME_PER_REQUEST_IN_SEC = 60 * 6;

// User-agent string sent with every request (CURLOPT_USERAGENT).  Both the
// characters and the pointer are const so that nothing can retarget it.
const char *const FAKE_AGENT_STRING = "Mozilla/5.0 (X11; U; Linux i686; "
    "en-US; rv:1.7.12) Gecko/20050921";
00074
00075
00076
00077
00079
00080 class safe_int_array
00081 {
00082 public:
00083 safe_int_array() : _lock(), _list(0) {}
00084
00085 void add(int to_add) {
00087 auto_synchronizer l(_lock);
00088 _list += to_add;
00089 }
00090
00091 int length() {
00092 auto_synchronizer l(_lock);
00093 return _list.length();
00094 }
00095
00096 int_array make_copy() {
00097 auto_synchronizer l(_lock);
00098 return _list;
00099 }
00100
00101 private:
00102 mutex _lock;
00103 int_array _list;
00104 };
00105
00107
00108 class marks_checker : public application_shell
00109 {
00110 public:
00111 marks_checker()
00112 : application_shell(static_class_name()), _check_redirection(false),
00113 _max_threads(MAXIMUM_THREADS), _null_file(portable::null_device(), "w")
00114 {}
00115
00116 IMPLEMENT_CLASS_NAME("marks_checker");
00117 virtual int execute();
00118 int print_instructions(const filename &program_name);
00119
00120 int test_all_links();
00121
00122
00123 int check_link(const istring &url, istring &error_msg);
00124
00125
00126
00127 void write_new_files();
00128
00129
00130 private:
00131 bookmark_tree _categories;
00132 safe_int_array _bad_lines;
00133 thread_cabinet _checkers;
00134 istring _input_filename;
00135 istring _output_filename;
00136 istring _bad_link_filename;
00137 bool _check_redirection;
00138 int _max_threads;
00139 byte_filer _null_file;
00140
00141 static void handle_OS_signal(int sig_id);
00142
00143 };
00144
00146
00147 class checking_thread : public ithread
00148 {
00149 public:
00150 checking_thread(const link_record &link_info, safe_int_array &bad_lines,
00151 marks_checker &checker)
00152 : ithread(), _bad_lines(bad_lines), _checker(checker), _info(link_info) {}
00153
00154 void perform_activity(void *formal(ptr)) {
00155 istring message;
00156 int ret = _checker.check_link(_info._url, message);
00157 if (ret != 0) {
00158 istring complaint = isprintf("Bad Link at line %d:", _info._uid)
00159 += log_base::platform_ending();
00160 const istring spacer(' ', 4);
00161 complaint += spacer + _info._url += log_base::platform_ending();
00162 complaint += spacer + _info._description += log_base::platform_ending();
00163 complaint += spacer + "error: " += message;
00164 BASE_LOG(complaint);
00165 if ( (_info._uid> 100000) || (_info._uid < 0) ) {
00166 BASE_LOG(isprintf("somehow got bogus line number! %d", _info._uid));
00167 return;
00168 }
00169 _bad_lines.add(_info._uid);
00170 }
00171 }
00172
00173 private:
00174 safe_int_array &_bad_lines;
00175 marks_checker &_checker;
00176 link_record _info;
00177 };
00178
00180
00181 int marks_checker::print_instructions(const filename &program_name)
00182 {
00183 isprintf to_show("%s:\n\
00184 This program needs three filenames as command line parameters. The -i flag\n\
00185 is used to specify the input filename. The -o flag specifies the file where\n\
00186 where the good links will be written. The -b flag specifies the file where\n\
00187 the bad links are written. The optional flag --no-redirs can be used to\n\
00188 disallow web-site redirection, which will catch when the site has changed\n\
00189 its location. Note that redirection is not necessarily an error, but it\n\
00190 instead may just be a link that needs its URL modified. It is recommended\n\
00191 that you omit this flag in early runs, in order to only locate definitely\n\
00192 dead links. Then later checking runs can find any sites that were redirected\n\
00193 or being routed to a dead link page which doesn't provide an error code.\n\
00194 The optional flag --threads with a parameter will set the maximum number of\n\
00195 threads that will simultaneously check on links.\n\
00196 The input file is expected to be in the HOOPLE link database format.\n\
00197 The HOOPLE link format is documented here:\n\
00198 http://hoople.org/guides/link_database/format_manifesto.txt\n\
00199 ", program_name.basename().raw().s(), program_name.basename().raw().s());
00200 program_wide_logger().log(to_show.s());
00201 return 12;
00202 }
00203
00204
00205 size_t data_sink(void *formal(ptr), size_t size, size_t number, void *formal(stream))
00206 { return size * number; }
00207
00208 int marks_checker::check_link(const istring &url, istring &error_msg)
00209 {
00210 int to_return = -1;
00211
00212 CURL *cur = curl_easy_init();
00213
00214 curl_easy_setopt(cur, CURLOPT_URL, url.s());
00215
00216 curl_easy_setopt(cur, CURLOPT_SSL_VERIFYPEER, 0);
00217
00218 curl_easy_setopt(cur, CURLOPT_MAXFILESIZE, MAXIMUM_READ);
00219
00220 curl_easy_setopt(cur, CURLOPT_NOSIGNAL, 1);
00221
00222 curl_easy_setopt(cur, CURLOPT_TIMEOUT, TIME_PER_REQUEST_IN_SEC);
00223
00224 curl_easy_setopt(cur, CURLOPT_AUTOREFERER, true);
00225
00226
00227 curl_easy_setopt(cur, CURLOPT_WRITEDATA, _null_file.file_handle());
00228
00229
00230 curl_easy_setopt(cur, CURLOPT_WRITEFUNCTION, data_sink);
00231
00232
00233 curl_easy_setopt(cur, CURLOPT_USERAGENT, FAKE_AGENT_STRING);
00234
00235
00236 curl_easy_setopt(cur, CURLOPT_FTPLISTONLY, 1);
00237
00238
00239
00240 if (_check_redirection) {
00241
00242 curl_easy_setopt(cur, CURLOPT_FOLLOWLOCATION, 1);
00243 curl_easy_setopt(cur, CURLOPT_MAXREDIRS, 0);
00244 }
00245
00246 int tries = 0;
00247 while (tries++ < MAXIMUM_ATTEMPTS) {
00248
00249
00250
00251 error_msg = istring(' ', CURL_ERROR_SIZE + 5);
00252 curl_easy_setopt(cur, CURLOPT_ERRORBUFFER, error_msg.s());
00253
00254
00255
00256
00257 to_return = curl_easy_perform(cur);
00258
00259 error_msg.shrink();
00260
00261
00262
00263
00264 if (to_return == CURLE_FILESIZE_EXCEEDED) to_return = 0;
00265
00266 if (!to_return) {
00267
00268 long result;
00269 curl_easy_getinfo(cur, CURLINFO_RESPONSE_CODE, &result);
00270 if (result >= 400) {
00271 error_msg = isprintf("received http failure code %d", result);
00272 to_return = -1;
00273 }
00274 break;
00275 }
00276
00277 portable::sleep_ms(10 * SECOND_ms);
00278 }
00279
00280 curl_easy_cleanup(cur);
00281
00282 return to_return;
00283 }
00284
00285 int marks_checker::test_all_links()
00286 {
00287 FUNCDEF("test_all_links");
00288
00289 tree::iterator itty = _categories.access_root().start(tree::prefix);
00290 tree *curr = NIL;
00291 while ( (curr = itty.next()) ) {
00292 inner_mark_tree *nod = dynamic_cast<inner_mark_tree *>(curr);
00293 if (!nod)
00294 non_continuable_error(static_class_name(), func, "failed to cast a tree node");
00295
00296 for (int i = 0; i < nod->_links.elements(); i++) {
00297 link_record *lin = nod->_links.borrow(i);
00298 if (!lin->_url) continue;
00299
00300 while (_checkers.threads() > _max_threads) {
00301 portable::sleep_ms(PAUSEY_SNOOZE);
00302 _checkers.clean_debris();
00303 }
00304
00305 checking_thread *new_thread = new checking_thread(*lin, _bad_lines,
00306 *this);
00307 unique_int id = _checkers.add_thread(new_thread, true, NIL);
00308 }
00309 }
00310
00311 BASE_LOG("... finished iterating on tree.");
00312
00313
00314 while (_checkers.threads()) {
00315 portable::sleep_ms(PAUSEY_SNOOZE);
00316 _checkers.clean_debris();
00317 }
00318
00319 BASE_LOG("... finished waiting for all threads.");
00320
00321 return 0;
00322 }
00323
00324 void marks_checker::write_new_files()
00325 {
00326 byte_filer input_file(_input_filename, "r");
00327 byte_filer output_file(_output_filename, "w");
00328 byte_filer badness_file(_bad_link_filename, "w");
00329
00330 int_array badness = _bad_lines.make_copy();
00331 shell_sort<int>(badness.access(), badness.length());
00332
00333 BASE_LOG("bad links are on lines:");
00334 istring bad_list;
00335 for (int i = 0; i < badness.length(); i++) {
00336 bad_list += isprintf("%d, ", badness[i]);
00337 }
00338 BASE_LOG(bad_list);
00339
00340 istring buffer;
00341 int curr_line = 0;
00342 while (!input_file.eof()) {
00343 curr_line++;
00344 while (badness.length() && (badness[0] < curr_line) ) {
00345 BASE_LOG(isprintf("whacking too low line number: %d", badness[0]));
00346 badness.zap(0, 0);
00347 }
00348 input_file.getline(buffer, 2048);
00349
00350 if (badness.length() && (badness[0] == curr_line)) {
00351
00352 badness_file.write(buffer);
00353 badness.zap(0, 0);
00354 } else {
00355
00356 output_file.write(buffer);
00357 }
00358
00359 }
00360 input_file.close();
00361 output_file.close();
00362 badness_file.close();
00363 }
00364
00365 marks_checker *main_program = NIL;
00366
00367 void marks_checker::handle_OS_signal(int formal(sig_id))
00368 {
00369 signal(SIGINT, SIG_IGN);
00370 BASE_LOG("caught break signal... now writing files.");
00371 if (main_program) main_program->write_new_files();
00372 BASE_LOG("exiting after handling break.");
00373 main_program = NIL;
00374 exit(0);
00375 }
00376
00377 int marks_checker::execute()
00378 {
00379 FUNCDEF("main");
00380 SET_DEFAULT_COMBO_LOGGER;
00381
00382 main_program = this;
00383
00384 command_line cmds(__argc, __argv);
00385 if (!cmds.get_value('i', _input_filename, false))
00386 return print_instructions(cmds.program_name());
00387 if (!cmds.get_value('o', _output_filename, false))
00388 return print_instructions(cmds.program_name());
00389 if (!cmds.get_value('b', _bad_link_filename, false))
00390 return print_instructions(cmds.program_name());
00391
00392 istring temp;
00393
00394
00395 if (cmds.get_value("no-redirs", temp, false)) {
00396 BASE_LOG("Enabling redirection checking: redirected web sites are reported as bad.");
00397 _check_redirection = true;
00398 }
00399
00400 istring threads;
00401 if (cmds.get_value("threads", threads, false)) {
00402 _max_threads = threads.convert(0);
00403 BASE_LOG(isprintf("Maximum threads allowed=%d", _max_threads));
00404 }
00405
00406 BASE_LOG(istring("input file: ") + _input_filename);
00407 BASE_LOG(istring("output file: ") + _output_filename);
00408 BASE_LOG(istring("bad link file: ") + _bad_link_filename);
00409
00410
00411
00412
00413 BASE_LOG("before reading input...");
00414
00415 int ret = _categories.read_csv_file(_input_filename);
00416 if (ret) return ret;
00417
00418 BASE_LOG("after reading input...");
00419
00420 signal(SIGINT, handle_OS_signal);
00421
00422
00423
00424 curl_global_init(CURL_GLOBAL_ALL);
00425
00426 ret = test_all_links();
00427
00428 write_new_files();
00429 main_program = NIL;
00430
00431 curl_global_cleanup();
00432
00433 return 0;
00434 }
00435
00437
// NOTE(review): HOOPLE_MAIN presumably expands to the program's main()
// entry point, which creates a marks_checker and runs it -- confirm
// against the header that defines the macro.
00438 HOOPLE_MAIN(marks_checker, )
00439
00440 #ifdef __BUILD_STATIC_APPLICATION__
00441
00442 #include <basis/array.cpp>
00443 #include <basis/byte_array.cpp>
00444 #include <basis/callstack_tracker.cpp>
00445 #include <basis/chaos.cpp>
00446 #include <basis/convert_utf.cpp>
00447 #include <basis/definitions.cpp>
00448 #include <basis/earth_time.cpp>
00449 #include <basis/guards.cpp>
00450 #include <basis/istring.cpp>
00451 #include <basis/log_base.cpp>
00452 #include <basis/memory_checker.cpp>
00453 #include <basis/mutex.cpp>
00454 #include <basis/object_base.cpp>
00455 #include <basis/outcome.cpp>
00456 #include <basis/packable.cpp>
00457 #include <basis/portable.cpp>
00458 #include <basis/sequence.cpp>
00459 #include <basis/set.cpp>
00460 #include <basis/utility.cpp>
00461 #include <basis/version_record.cpp>
00462 #include <data_struct/amorph.cpp>
00463 #include <data_struct/bit_vector.cpp>
00464 #include <data_struct/byte_hasher.cpp>
00465 #include <data_struct/configurator.cpp>
00466 #include <data_struct/hash_table.cpp>
00467 #include <data_struct/pointer_hash.cpp>
00468 #include <data_struct/stack.cpp>
00469 #include <data_struct/static_memory_gremlin.cpp>
00470 #include <data_struct/string_hash.cpp>
00471 #include <data_struct/string_hasher.cpp>
00472 #include <data_struct/string_table.cpp>
00473 #include <data_struct/symbol_table.cpp>
00474 #include <data_struct/table_configurator.cpp>
00475 #include <loggers/console_logger.cpp>
00476 #include <loggers/file_logger.cpp>
00477 #include <loggers/locked_logger.cpp>
00478 #include <loggers/null_logger.cpp>
00479 #include <loggers/program_wide_logger.cpp>
00480 #include <mechanisms/ithread.cpp>
00481 #include <mechanisms/roller.cpp>
00482 #include <mechanisms/thread_cabinet.cpp>
00483 #include <mechanisms/time_stamp.cpp>
00484 #include <nodes/node.cpp>
00485 #include <nodes/path.cpp>
00486 #include <nodes/symbol_tree.cpp>
00487 #include <nodes/tree.cpp>
00488 #include <opsystem/application_base.cpp>
00489 #include <opsystem/application_shell.cpp>
00490 #include <opsystem/byte_filer.cpp>
00491 #include <opsystem/command_line.cpp>
00492 #include <opsystem/critical_events.cpp>
00493 #include <opsystem/directory.cpp>
00494 #include <opsystem/filename.cpp>
00495 #include <opsystem/ini_config.cpp>
00496 #include <opsystem/ini_parser.cpp>
00497 #include <opsystem/path_configuration.cpp>
00498 #include <opsystem/rendezvous.cpp>
00499 #include <textual/byte_format.cpp>
00500 #include <textual/list_parsing.cpp>
00501 #include <textual/parser_bits.cpp>
00502 #include <textual/string_manipulation.cpp>
00503 #include <textual/tokenizer.cpp>
00504 #endif // __BUILD_STATIC_APPLICATION__
00505