Author: sich
Date: 2006-04-11 17:01:08 +0000 (Tue, 11 Apr 2006)
New Revision: 8515
Modified:
trunk/apps/DarknetSpiderBot/bot.php
trunk/apps/DarknetSpiderBot/config.php
Log:
Update the files to the latest version
Modified: trunk/apps/DarknetSpiderBot/bot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 16:53:12 UTC (rev 8514)
+++ trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 17:01:08 UTC (rev 8515)
@@ -2,61 +2,20 @@
require_once('config.php');
+//$url = $addresse_fcp.$start_page;
+//$url = 'http://www.lemonde.fr/';
+$sitepath = "/SSK@PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE/Index-21/all.html";
-/*
-$addresse_complete = "$addresse_fcp" . "$start_page";
-
-exec("c:\wget\wget.exe --timeout=$timeout $addresse_complete -O
c:\serveur\www\freenetbot\local.html");
-
-
-
-
-$fich='local.html';
-$ouvre=fopen($fich,'r');
-$filesize = filesize("local.html");
-
-
-while(!feof($ouvre))
-{
- $ligne=fgets($ouvre,$filesize);
-
- if (eregi("<title>(.*)</title>", $ligne, $titre) == TRUE) {
- //echo $titre[1];
- }
-
- if (eregi("<a(.*)>(.*)</a>", $ligne, $liens) == TRUE) {
- $liens_complet = $liens[0];
- $test = explode("href=",$liens_complet);
- $testa = $test[1];
- $test1 = explode("\"",$testa);
- $testb = $test1[1];
-
- if (eregi("newbookmark",$testb) == TRUE) { }
- elseif (eregi("@",$testb) == TRUE) {
- $cible = "$addresse_fcp" . "$testb";
- echo "externe : $cible<br>";
- }
- else {
- $cible = "$addresse_complete" . "$testb";
- echo "interne : $cible<br>";
- }
- //exit();
- }
- break;
-}
-
-fclose($ouvre);
-*/
-
-
-$url = $addresse_fcp.$start_page;
-
$buffer_file = 'local.html';
$bot = new bot();
-$bot->getDistantFile($url, $buffer_file);
+$bot->getDistantFile($buffer_file, $fcp, $sitepath);
echo 'title: '.$bot->extractTitle();
+$urls = $bot->extractURLs();
+$bot->reconstructURLs($urls, $sitepath);
+
+print_r($urls);
//echo $bot->buffer_contents;
@@ -64,12 +23,12 @@
var $buffer_contents;
- function getDistantFile ($url, $dest)
+ function getDistantFile ($buffer_file, $fcp, $sitepath='')
{
global $timeout, $wget_dir;
- exec($wget_dir."wget.exe --timeout=$timeout $url -O $dest");
- $this->buffer_contents = $this->getFileContents($dest);
+ exec($wget_dir."wget.exe --timeout=$timeout ${fcp}$sitepath -O
$buffer_file");
+ $this->buffer_contents = $this->getFileContents($buffer_file);
}
function getFileContents ($file)
@@ -84,65 +43,116 @@
function extractTitle ()
{
- if ( preg_match_all('/<title>(.*?)<\/title>/s',
$this->buffer_contents, $title) ) {
+ if ( preg_match_all('/<title>(.+?)<\/title>/s',
$this->buffer_contents, $title) ) {
return $title[1][0];
}
}
-
- function extractidentifier_url ()
+
+ function extractMetas ()
{
- if ( preg_match_all('/<META NAME=\"identifier-url\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $identifier_url) ) {
- return $identifier_url[1][0];
- }
- }
+ if (preg_match_all('/<meta(.+?)>/si', $this->buffer_contents,
$matches))
+ {
+ foreach ($matches[1] as $value) // contenu de chaque
balise meta
+ {
+ preg_match_all('/ ?(.+?)="(.+?)" ?/si', $value,
$matches2);
+ foreach ($matches2[1] as $key => $value) // chaque clé
+ {
+ if ($value == 'name' || $value ==
'content')
+ $buf[ $matches2[1][$key] ] =
$matches2[2][$key];
+ }
+
+ if ( !empty($buf['name']) &&
!empty($buf['content']) )
+ $meta[$buf['name']] = $buf['content'];
- function extractrevisit_after ()
- {
- if ( preg_match_all('/<META NAME=\"revisit-after\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $revisit_after) ) {
- return $revisit_after[1][0];
- }
- }
+ unset($buf);
- function extractdescription ()
- {
- if ( preg_match_all('/<META NAME=\"description\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $description) ) {
- return $description[1][0];
+ }
}
+
+ return $meta;
+
}
-
- function extractkeywords ()
+
+ function extractURLs ()
{
- if ( preg_match_all('/<META NAME=\"keywords\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $keywords) ) {
- return $keywords[1][0];
- }
+
+ if ( preg_match_all('/<a href="(.*?)".*>/i',
$this->buffer_contents, $matches) )
+ return $matches[1];
+
}
-
- function extractdate_creation ()
+
+ function reconstructURLs (&$urls, $sitepath)
{
- if ( preg_match_all('/<META NAME=\"date-creation-yyyymmdd\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $date_creation) ) {
- return $date_creation[1][0];
- }
- }
- function extractdate_revision ()
- {
- if ( preg_match_all('/<META NAME=\"date-revision-yyyymmdd\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $date_revision) ) {
- return $date_revision[1][0];
+ foreach ($urls as $key => $value)
+ {
+
+ if ( substr($value, 0, 7) == 'http://') // si l'url
commence par http://, on la retire
+ $value = '';
+
+ if ( substr($value, -1) == '/') // si l'url fini par un
slash, on le retire
+ $value = substr($value, 0, -1);
+
+ if ( substr($value, 0, 1) != '/') // si ce n'est pas
une url absolue alors
+ {
+ if ( substr($value, 0, 2) == './') // on enlève éventuellement ./
+ $value = substr($value, 2);
+
+ // on ajoute $sitepath
+ $value = $sitepath.'/'.$value;
+ }
+
+ // mise à jour de l'url
+ $urls[$key] = $value;
+
}
}
+}
- function extractcategory ()
- {
- if ( preg_match_all('/<META NAME=\"category\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $category) ) {
- return $category[1][0];
- }
+
+/*
+$addresse_complete = "$addresse_fcp" . "$start_page";
+
+exec("c:\wget\wget.exe --timeout=$timeout $addresse_complete -O
c:\serveur\www\freenetbot\local.html");
+
+
+
+
+$fich='local.html';
+$ouvre=fopen($fich,'r');
+$filesize = filesize("local.html");
+
+
+while(!feof($ouvre))
+{
+ $ligne=fgets($ouvre,$filesize);
+
+ if (eregi("<title>(.*)</title>", $ligne, $titre) == TRUE) {
+ //echo $titre[1];
}
-
- function extractpublisher ()
- {
- if ( preg_match_all('/<META NAME=\"publisher\"
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $publisher) ) {
- return $publisher[1][0];
+
+ if (eregi("<a(.*)>(.*)</a>", $ligne, $liens) == TRUE) {
+ $liens_complet = $liens[0];
+ $test = explode("href=",$liens_complet);
+ $testa = $test[1];
+ $test1 = explode("\"",$testa);
+ $testb = $test1[1];
+
+ if (eregi("newbookmark",$testb) == TRUE) { }
+ elseif (eregi("@",$testb) == TRUE) {
+ $cible = "$addresse_fcp" . "$testb";
+ echo "externe : $cible<br>";
}
+ else {
+ $cible = "$addresse_complete" . "$testb";
+ echo "interne : $cible<br>";
+ }
+ //exit();
}
+ break;
}
+
+fclose($ouvre);
+*/
+
?>
\ No newline at end of file
Modified: trunk/apps/DarknetSpiderBot/config.php
===================================================================
--- trunk/apps/DarknetSpiderBot/config.php 2006-04-11 16:53:12 UTC (rev 8514)
+++ trunk/apps/DarknetSpiderBot/config.php 2006-04-11 17:01:08 UTC (rev 8515)
@@ -1,13 +1,13 @@
<?php
- $start_page = "/SSK@PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE/Index-21/";
- $start_file = "all.html";
-
- $timeout = "20";
- $addresse_fcp = "http://127.0.0.1:8888";
-
- $hostname_bot = "10.0.0.1";
- $database_bot = "freenet";
- $username_bot = "sich";
- $password_bot = "19Geneve54";
- $bot = mysql_pconnect($hostname_bot, $username_bot, $password_bot) or
trigger_error(mysql_error(),E_USER_ERROR);
-?>
+
+$timeout = "20";
+$fcp = "http://127.0.0.1:8888";
+
+$wget_dir = 'D:\\Darknet\\bin\\';
+
+$hostname_bot = "";
+$database_bot = "";
+$username_bot = "";
+$password_bot = "";
+//$bot = mysql_pconnect($hostname_bot, $username_bot, $password_bot) or
trigger_error(mysql_error(),E_USER_ERROR);
+?>
\ No newline at end of file