Author: dongy
Date: 2006-04-11 19:07:06 +0000 (Tue, 11 Apr 2006)
New Revision: 8520
Added:
trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php
trunk/apps/DarknetSpiderBot/include/
trunk/apps/DarknetSpiderBot/include/config.inc.php
Removed:
trunk/apps/DarknetSpiderBot/bot.php
trunk/apps/DarknetSpiderBot/config.php
Log:
getLastEdition added
restructuring project
Added: trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php 2006-04-11 18:33:13 UTC
(rev 8519)
+++ trunk/apps/DarknetSpiderBot/DarknetSpiderBot.php 2006-04-11 19:07:06 UTC
(rev 8520)
@@ -0,0 +1,175 @@
+<?php
+set_time_limit(90);
+
+require_once('include/config.inc.php');
+
+
+$buffer_file = 'local.html';
+
+$site_key =
'PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE';
+$site_name = 'Index';
+$last_know_edition = '20';
+
+
+
+$bot = new bot($fcp_host, $fcp_port, $buffer_file);
+
+echo $bot->getLastEdition($site_key, $site_name, $last_know_edition);
+
+//$sitepath = "/USK@$sitekey/$sitename";
+//$bot->getDistantFile($fcp_host, $fcp_port, $sitepath.'/-1');
+
+
+//$urls = $bot->extractURLs();
+//$bot->cleanURLs($urls, $sitekey, $sitename);
+
+//print_r($urls);
+//echo $bot->buffer;
+
+echo "\r\nDarknetSpiderBot is closing...\r\n";
+
+
+
+class bot {
+
+ var $fcp_host;
+ var $fcp_port;
+
+ var $buffer;
+ var $buffer_file;
+
+
+ function bot ($fcp_host, $fcp_port, $buffer_file)
+ {
+ $this->fcp_host = $fcp_host;
+ $this->fcp_port = $fcp_port;
+ $this->buffer_file = $buffer_file;
+ }
+
+ function getDistantFile ($path, $timeout=30)
+ {
+
+ $fp = fsockopen($this->fcp_host, $this->fcp_port, $errno,
$errstr, $timeout);
+ if (!$fp) {
+ echo "$errstr ($errno)<br />\n";
+ }
+ else
+ {
+ $out = "GET $path HTTP/1.1\r\n";
+ $out .= "Host: $fcp_host\r\n";
+ $out .= "Connection: Close\r\n\r\n";
+
+ fwrite($fp, $out);
+
+ while ( !feof($fp) )
+ {
+ $buffer .= fgets($fp, 4096);
+ }
+ fclose($fp);
+ }
+
+ $this->buffer = $buffer;
+ }
+
+ function getFileContents ($filename)
+ {
+
+ $handle = fopen($filename, 'r') or die("Error during open
\"$filename\"");
+ $contents = fread($handle);
+ fclose($handle);
+
+ return $contents;
+ }
+
+ function getLastEdition ($site_key, $site_name, $last_know_edition)
+ {
+ $path = "/USK@$site_key/$site_name/-$last_know_edition";
+ //$path = "/USK@$site_key/$site_name/$last_know_edition";
+
+ $this->getDistantFile($path, 60);
+
+ if ( preg_match('/\nLocation: (.+)/', $this->buffer, $matches) )
+ return $matches[1];
+
+ return false;
+ }
+
+ function extractTitle ()
+ {
+ if ( preg_match_all('/<title>(.+?)<\/title>/s',
$this->buffer_contents, $title) ) {
+ return $title[1][0];
+ }
+ }
+
+ function extractMetas ()
+ {
+ if (preg_match_all('/<meta(.+?)>/si', $this->buffer_contents,
$matches))
+ {
+ foreach ($matches[1] as $value) // contenu de chaque
balise meta
+ {
+ preg_match_all('/ ?(.+?)="(.+?)" ?/si', $value,
$matches2);
+ foreach ($matches2[1] as $key => $value) //
chaque cl?e
+ {
+ if ($value == 'name' || $value ==
'content')
+ $buf[ $matches2[1][$key] ] =
$matches2[2][$key];
+ }
+
+ if ( !empty($buf['name']) &&
!empty($buf['content']) )
+ $meta[$buf['name']] = $buf['content'];
+
+ unset($buf);
+
+ }
+ }
+
+ return $meta;
+
+ }
+
+ function extractURLs ()
+ {
+
+ if ( preg_match_all('/<a href="(.*?)".*>/i',
$this->buffer_contents, $matches) )
+ return $matches[1];
+
+ }
+
+ function cleanURLs (&$urls, $sitekey, $sitename)
+ {
+ // todo: support des ../
+
+ foreach ($urls as $key => $value)
+ {
+
+ $value = trim($value);
+
+ if ( substr($value, 0, 7) == 'http://') // si l'url
commence par http://, on la retire
+ {
+ $value = '';
+ }
+ elseif ( substr($value, 0, 1) != '/') // si ce n'est
pas une url absolue alors
+ {
+ if ( substr($value, 0, 2) == './') // on enl?ve
?ventuellement ./
+ $value = substr($value, 2);
+
+ // on ajoute $sitepath
+ $value = '/'.$sitekey.'/'.$sitename.'/'.$value;
+ }
+
+ if ( substr($value, -1) == '/') // si l'url fini par un
slash, on le retire
+ $value = substr($value, 0, -1);
+
+ // On retire les liens vers les diverses versions
+ if (
preg_match("#^/[A-Z]{3,3}@$sitekey/$sitename/?-[0-9]+$#i", $value, $matches,
PREG_OFFSET_CAPTURE) )
+ $value = '';
+
+ // mise ? jour de l'url
+ $urls[$key] = $value;
+
+ }
+ }
+
+
+}
+
+?>
\ No newline at end of file
Deleted: trunk/apps/DarknetSpiderBot/bot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 18:33:13 UTC (rev 8519)
+++ trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 19:07:06 UTC (rev 8520)
@@ -1,198 +0,0 @@
-<?php
-set_time_limit(60);
-
-require_once('config.php');
-
-//$url = $addresse_fcp.$start_page;
-//$url = 'http://www.lemonde.fr/';
-
-$sitekey =
'PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE';
-$sitename = 'Index';
-
-$sitepath = "/USK@$sitekey/$sitename";
-
-$buffer_file = 'local.html';
-
-$bot = new bot();
-
-$bot->getDistantFile($buffer_file, $fcp_host, $fcp_port, $sitepath.'/-1');
-//$bot->buffer_contents = $bot->getFileContents($buffer_file);
-
-//$urls = $bot->extractURLs();
-//$bot->cleanURLs($urls, $sitekey, $sitename);
-
-//print_r($urls);
-//echo $bot->buffer_contents;
-
-
-class bot {
-
- var $buffer_contents;
-
- function getDistantFile ($buffer_file, $fcp_host, $fcp_port,
$sitepath='')
- {
- global $timeout, $wget_dir;
-
- //exec($wget_dir."wget.exe --timeout=$timeout ${fcp}$sitepath
-O $buffer_file");
-
- $fp = fsockopen($fcp_host, $fcp_port, $errno, $errstr,
$timeout);
- if (!$fp) {
- echo "$errstr ($errno)<br />\n";
- }
- else
- {
- $out = "GET $sitepath HTTP/1.1\r\n";
- $out .= "Host: $fcp_host\r\n";
- $out .= "Connection: Close\r\n\r\n";
-
- fwrite($fp, $out);
-
- while (!feof($fp)) {
- echo fgets($fp, 128);
- }
- fclose($fp);
- }
-
- $this->buffer_contents = $this->getFileContents($buffer_file);
- }
-
- function getFileContents ($file)
- {
-
- $handle = fopen($file, 'r') or die('Erreur ? l\'ouverture du
fichier'.$file);
- $contents = fread($handle, filesize ($file));
- fclose($handle);
-
- return $contents;
- }
-
- function getLastEdition ()
- {
-
- }
-
- function extractTitle ()
- {
- if ( preg_match_all('/<title>(.+?)<\/title>/s',
$this->buffer_contents, $title) ) {
- return $title[1][0];
- }
- }
-
- function extractMetas ()
- {
- if (preg_match_all('/<meta(.+?)>/si', $this->buffer_contents,
$matches))
- {
- foreach ($matches[1] as $value) // contenu de chaque
balise meta
- {
- preg_match_all('/ ?(.+?)="(.+?)" ?/si', $value,
$matches2);
- foreach ($matches2[1] as $key => $value) //
chaque cl?e
- {
- if ($value == 'name' || $value ==
'content')
- $buf[ $matches2[1][$key] ] =
$matches2[2][$key];
- }
-
- if ( !empty($buf['name']) &&
!empty($buf['content']) )
- $meta[$buf['name']] = $buf['content'];
-
- unset($buf);
-
- }
- }
-
- return $meta;
-
- }
-
- function extractURLs ()
- {
-
- if ( preg_match_all('/<a href="(.*?)".*>/i',
$this->buffer_contents, $matches) )
- return $matches[1];
-
- }
-
- function cleanURLs (&$urls, $sitekey, $sitename)
- {
- // todo: support des ../
-
- foreach ($urls as $key => $value)
- {
-
- $value = trim($value);
-
- if ( substr($value, 0, 7) == 'http://') // si l'url
commence par http://, on la retire
- {
- $value = '';
- }
- elseif ( substr($value, 0, 1) != '/') // si ce n'est
pas une url absolue alors
- {
- if ( substr($value, 0, 2) == './') // on enl?ve
?ventuellement ./
- $value = substr($value, 2);
-
- // on ajoute $sitepath
- $value = '/'.$sitekey.'/'.$sitename.'/'.$value;
- }
-
- if ( substr($value, -1) == '/') // si l'url fini par un
slash, on le retire
- $value = substr($value, 0, -1);
-
- // On retire les liens vers les diverses versions
- if (
preg_match("#^/[A-Z]{3,3}@$sitekey/$sitename/?-[0-9]+$#i", $value, $matches,
PREG_OFFSET_CAPTURE) )
- $value = '';
-
- // mise ? jour de l'url
- $urls[$key] = $value;
-
- }
- }
-
-
-}
-
-
-/*
-$addresse_complete = "$addresse_fcp" . "$start_page";
-
-exec("c:\wget\wget.exe --timeout=$timeout $addresse_complete -O
c:\serveur\www\freenetbot\local.html");
-
-
-
-
-$fich='local.html';
-$ouvre=fopen($fich,'r');
-$filesize = filesize("local.html");
-
-
-while(!feof($ouvre))
-{
- $ligne=fgets($ouvre,$filesize);
-
- if (eregi("<title>(.*)</title>", $ligne, $titre) == TRUE) {
- //echo $titre[1];
- }
-
- if (eregi("<a(.*)>(.*)</a>", $ligne, $liens) == TRUE) {
- $liens_complet = $liens[0];
- $test = explode("href=",$liens_complet);
- $testa = $test[1];
- $test1 = explode("\"",$testa);
- $testb = $test1[1];
-
- if (eregi("newbookmark",$testb) == TRUE) { }
- elseif (eregi("@",$testb) == TRUE) {
- $cible = "$addresse_fcp" . "$testb";
- echo "externe : $cible<br>";
- }
- else {
- $cible = "$addresse_complete" . "$testb";
- echo "interne : $cible<br>";
- }
- //exit();
- }
- break;
-}
-
-fclose($ouvre);
-*/
-
-?>
\ No newline at end of file
Deleted: trunk/apps/DarknetSpiderBot/config.php
===================================================================
--- trunk/apps/DarknetSpiderBot/config.php 2006-04-11 18:33:13 UTC (rev
8519)
+++ trunk/apps/DarknetSpiderBot/config.php 2006-04-11 19:07:06 UTC (rev
8520)
@@ -1,14 +0,0 @@
-<?php
-
-$timeout = "20";
-$fcp_host = "127.0.0.1";
-$fcp_port = '8888';
-
-$wget_dir = 'c:\\wget\\';
-
-$hostname_bot = "";
-$database_bot = "";
-$username_bot = "";
-$password_bot = "";
-//$bot = mysql_pconnect($hostname_bot, $username_bot, $password_bot) or
trigger_error(mysql_error(),E_USER_ERROR);
-?>
\ No newline at end of file
Added: trunk/apps/DarknetSpiderBot/include/config.inc.php
===================================================================
--- trunk/apps/DarknetSpiderBot/include/config.inc.php 2006-04-11 18:33:13 UTC
(rev 8519)
+++ trunk/apps/DarknetSpiderBot/include/config.inc.php 2006-04-11 19:07:06 UTC
(rev 8520)
@@ -0,0 +1,15 @@
+<?php
+
+$timeout = "20";
+$fcp_host = "127.0.0.1";
+$fcp_port = '8888';
+
+/*
+$hostname_bot = "";
+$database_bot = "";
+$username_bot = "";
+$password_bot = "";
+$bot = mysql_pconnect($hostname_bot, $username_bot, $password_bot) or
trigger_error(mysql_error(),E_USER_ERROR);
+*/
+
+?>
\ No newline at end of file