Author: dongy
Date: 2006-04-11 17:28:06 +0000 (Tue, 11 Apr 2006)
New Revision: 8517
Modified:
trunk/apps/DarknetSpiderBot/bot.php
trunk/apps/DarknetSpiderBot/config.php
Log:
my first commit
Modified: trunk/apps/DarknetSpiderBot/bot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 17:17:36 UTC (rev 8516)
+++ trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 17:28:06 UTC (rev 8517)
@@ -1,21 +1,27 @@
<?php
+set_time_limit(60);
require_once('config.php');
//$url = $addresse_fcp.$start_page;
//$url = 'http://www.lemonde.fr/';
-$sitepath = "/SSK@PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE/Index-21/all.html";
+$sitekey =
'PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE';
+$sitename = 'Index';
+
+$sitepath = "/USK@$sitekey/$sitename";
+
$buffer_file = 'local.html';
$bot = new bot();
-$bot->getDistantFile($buffer_file, $fcp, $sitepath);
-echo 'title: '.$bot->extractTitle();
-$urls = $bot->extractURLs();
-$bot->reconstructURLs($urls, $sitepath);
+$bot->getDistantFile($buffer_file, $fcp_host, $fcp_port, $sitepath.'/-1');
+//$bot->buffer_contents = $bot->getFileContents($buffer_file);
-print_r($urls);
+//$urls = $bot->extractURLs();
+//$bot->cleanURLs($urls, $sitekey, $sitename);
+
+//print_r($urls);
//echo $bot->buffer_contents;
@@ -23,11 +29,30 @@
var $buffer_contents;
- function getDistantFile ($buffer_file, $fcp, $sitepath='')
+ function getDistantFile ($buffer_file, $fcp_host, $fcp_port,
$sitepath='')
{
global $timeout, $wget_dir;
- exec($wget_dir."wget.exe --timeout=$timeout ${fcp}$sitepath -O
$buffer_file");
+ //exec($wget_dir."wget.exe --timeout=$timeout ${fcp}$sitepath
-O $buffer_file");
+
+ $fp = fsockopen($fcp_host, $fcp_port, $errno, $errstr,
$timeout);
+ if (!$fp) {
+ echo "$errstr ($errno)<br />\n";
+ }
+ else
+ {
+ $out = "GET $sitepath HTTP/1.1\r\n";
+ $out .= "Host: $fcp_host\r\n";
+ $out .= "Connection: Close\r\n\r\n";
+
+ fwrite($fp, $out);
+
+ while (!feof($fp)) {
+ echo fgets($fp, 128);
+ }
+ fclose($fp);
+ }
+
$this->buffer_contents = $this->getFileContents($buffer_file);
}
@@ -41,6 +66,11 @@
return $contents;
}
+ function getLastEdition ()
+ {
+
+ }
+
function extractTitle ()
{
if ( preg_match_all('/<title>(.+?)<\/title>/s',
$this->buffer_contents, $title) ) {
@@ -81,32 +111,42 @@
}
- function reconstructURLs (&$urls, $sitepath)
+ function cleanURLs (&$urls, $sitekey, $sitename)
{
-
+ // todo: support des ../
+
foreach ($urls as $key => $value)
{
-
+
+ $value = trim($value);
+
if ( substr($value, 0, 7) == 'http://') // si l'url
commence par http://, on la retire
+ {
$value = '';
-
- if ( substr($value, -1) == '/') // si l'url fini par un
slash, on le retire
- $value = substr($value, 0, -1);
-
- if ( substr($value, 0, 1) != '/') // si ce n'est pas
une url absolue alors
+ }
+ elseif ( substr($value, 0, 1) != '/') // si ce n'est
pas une url absolue alors
{
if ( substr($value, 0, 2) == './') // on enlève éventuellement ./
$value = substr($value, 2);
// on ajoute $sitepath
- $value = $sitepath.'/'.$value;
+ $value = '/'.$sitekey.'/'.$sitename.'/'.$value;
}
+ if ( substr($value, -1) == '/') // si l'url fini par un
slash, on le retire
+ $value = substr($value, 0, -1);
+
+ // On retire les liens vers les diverses versions
+ if (
preg_match("#^/[A-Z]{3,3}@$sitekey/$sitename/?-[0-9]+$#i", $value, $matches,
PREG_OFFSET_CAPTURE) )
+ $value = '';
+
// mise à jour de l'url
$urls[$key] = $value;
}
}
+
+
}
Modified: trunk/apps/DarknetSpiderBot/config.php
===================================================================
--- trunk/apps/DarknetSpiderBot/config.php 2006-04-11 17:17:36 UTC (rev 8516)
+++ trunk/apps/DarknetSpiderBot/config.php 2006-04-11 17:28:06 UTC (rev 8517)
@@ -1,9 +1,10 @@
<?php
$timeout = "20";
-$fcp = "http://127.0.0.1:8888";
+$fcp_host = "127.0.0.1";
+$fcp_port = '8888';
-$wget_dir = 'D:\\Darknet\\bin\\';
+$wget_dir = 'c:\\wget\\';
$hostname_bot = "";
$database_bot = "";