Author: sich
Date: 2006-04-11 17:01:08 +0000 (Tue, 11 Apr 2006)
New Revision: 8515

Modified:
   trunk/apps/DarknetSpiderBot/bot.php
   trunk/apps/DarknetSpiderBot/config.php
Log:
update the files to the last version

Modified: trunk/apps/DarknetSpiderBot/bot.php
===================================================================
--- trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 16:53:12 UTC (rev 8514)
+++ trunk/apps/DarknetSpiderBot/bot.php 2006-04-11 17:01:08 UTC (rev 8515)
@@ -2,61 +2,20 @@

 require_once('config.php'); 

+//$url = $addresse_fcp.$start_page;
+//$url = 'http://www.lemonde.fr/';
+$sitepath = "/SSK@PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE/Index-21/all.html";

-/*
-$addresse_complete = "$addresse_fcp" . "$start_page";
-
-exec("c:\wget\wget.exe --timeout=$timeout $addresse_complete -O 
c:\serveur\www\freenetbot\local.html");
-
-
-
-
-$fich='local.html';
-$ouvre=fopen($fich,'r');
-$filesize = filesize("local.html");
-
-
-while(!feof($ouvre))
-{
-       $ligne=fgets($ouvre,$filesize);
-       
-       if (eregi("<title>(.*)</title>", $ligne, $titre) == TRUE) {
-               //echo $titre[1];
-       }
-       
-       if (eregi("<a(.*)>(.*)</a>", $ligne, $liens) == TRUE) {
-               $liens_complet = $liens[0];
-               $test = explode("href=",$liens_complet);
-               $testa = $test[1];
-               $test1 = explode("\"",$testa);
-               $testb = $test1[1];
-               
-               if (eregi("newbookmark",$testb) == TRUE) { }
-               elseif (eregi("@",$testb) == TRUE) {
-                       $cible = "$addresse_fcp" . "$testb";
-                       echo "externe : $cible<br>";
-               }
-               else { 
-                       $cible = "$addresse_complete" . "$testb";
-                       echo "interne : $cible<br>"; 
-               }
-               //exit();
-       }
-    break;
-}
-
-fclose($ouvre);
-*/
-
-
-$url = $addresse_fcp.$start_page;
-
 $buffer_file = 'local.html';

 $bot = new bot();
-$bot->getDistantFile($url, $buffer_file);
+$bot->getDistantFile($buffer_file, $fcp, $sitepath);
 echo 'title: '.$bot->extractTitle();

+$urls = $bot->extractURLs();
+$bot->reconstructURLs($urls, $sitepath);
+
+print_r($urls);
 //echo $bot->buffer_contents;


@@ -64,12 +23,12 @@

        var $buffer_contents;

-       function getDistantFile ($url, $dest)
+       function getDistantFile ($buffer_file, $fcp, $sitepath='')
        {
                global $timeout, $wget_dir;

-               exec($wget_dir."wget.exe --timeout=$timeout $url -O $dest");
-               $this->buffer_contents = $this->getFileContents($dest);
+               exec($wget_dir."wget.exe --timeout=$timeout ${fcp}$sitepath -O 
$buffer_file");
+               $this->buffer_contents = $this->getFileContents($buffer_file);
        }

        function getFileContents ($file)
@@ -84,65 +43,116 @@

        function extractTitle ()
        {
-               if ( preg_match_all('/<title>(.*?)<\/title>/s', 
$this->buffer_contents, $title) ) {
+               if ( preg_match_all('/<title>(.+?)<\/title>/s', 
$this->buffer_contents, $title) ) {
                        return $title[1][0];
                }
        }
-
-       function extractidentifier_url ()
+       
+       function extractMetas ()
        {
-               if ( preg_match_all('/<META NAME=\"identifier-url\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $identifier_url) ) {
-                       return $identifier_url[1][0];
-               }
-       }
+               if (preg_match_all('/<meta(.+?)>/si', $this->buffer_contents, 
$matches))
+               {
+                       foreach ($matches[1] as $value) // contenu de chaque 
balise meta
+                       {
+                               preg_match_all('/ ?(.+?)="(.+?)" ?/si', $value, 
$matches2);
+                               foreach ($matches2[1] as $key => $value) // 
chaque cl?e
+                               {
+                                       if ($value == 'name' || $value == 
'content')                                    
+                                               $buf[ $matches2[1][$key] ] = 
$matches2[2][$key];
+                               }
+                               
+                               if ( !empty($buf['name']) && 
!empty($buf['content']) )
+                                       $meta[$buf['name']] = $buf['content'];

-       function extractrevisit_after ()
-       {
-               if ( preg_match_all('/<META NAME=\"revisit-after\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $revisit_after) ) {
-                       return $revisit_after[1][0];
-               }
-       }
+                               unset($buf);

-       function extractdescription ()
-       {
-               if ( preg_match_all('/<META NAME=\"description\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $description) ) {
-                       return $description[1][0];
+                       }
                }
+               
+               return $meta;
+               
        }
-
-       function extractkeywords ()
+       
+       function extractURLs ()
        {
-               if ( preg_match_all('/<META NAME=\"keywords\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $keywords) ) {
-                       return $keywords[1][0];
-               }
+                       
+           if ( preg_match_all('/<a href="(.*?)".*>/i', 
$this->buffer_contents, $matches) )  
+               return $matches[1];
+               
        }
-
-       function extractdate_creation ()
+       
+       function reconstructURLs (&$urls, $sitepath)
        {
-               if ( preg_match_all('/<META NAME=\"date-creation-yyyymmdd\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $date_creation) ) {
-                       return $date_creation[1][0];
-               }
-       }

-       function extractdate_revision ()
-       {
-               if ( preg_match_all('/<META NAME=\"date-revision-yyyymmdd\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $date_revision) ) {
-                       return $date_revision[1][0];
+               foreach ($urls as $key => $value)
+               {
+                               
+                       if ( substr($value, 0, 7) == 'http://') // si l'url 
commence par http://, on la retire
+                               $value = '';
+                               
+                       if ( substr($value, -1) == '/') // si l'url fini par un 
slash, on le retire
+                               $value = substr($value, 0, -1);
+
+                       if ( substr($value, 0, 1) != '/') // si ce n'est pas 
une url absolue alors
+                       {
+                               if ( substr($value, 0, 2) == './') // on enlève éventuellement ./
+                                       $value = substr($value, 2);
+                               
+                               // on ajoute $sitepath
+                               $value = $sitepath.'/'.$value;
+                       }
+                       
+                       // mise à jour de l'url
+                       $urls[$key] = $value; 
+               
                }
        }
+}

-       function extractcategory ()
-       {
-               if ( preg_match_all('/<META NAME=\"category\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $category) ) {
-                       return $category[1][0];
-               }
+
+/*
+$addresse_complete = "$addresse_fcp" . "$start_page";
+
+exec("c:\wget\wget.exe --timeout=$timeout $addresse_complete -O 
c:\serveur\www\freenetbot\local.html");
+
+
+
+
+$fich='local.html';
+$ouvre=fopen($fich,'r');
+$filesize = filesize("local.html");
+
+
+while(!feof($ouvre))
+{
+       $ligne=fgets($ouvre,$filesize);
+       
+       if (eregi("<title>(.*)</title>", $ligne, $titre) == TRUE) {
+               //echo $titre[1];
        }
-
-       function extractpublisher ()
-       {
-               if ( preg_match_all('/<META NAME=\"publisher\" 
CONTENT=\"(.*)\">/s/i', $this->buffer_contents, $publisher) ) {
-                       return $publisher[1][0];
+       
+       if (eregi("<a(.*)>(.*)</a>", $ligne, $liens) == TRUE) {
+               $liens_complet = $liens[0];
+               $test = explode("href=",$liens_complet);
+               $testa = $test[1];
+               $test1 = explode("\"",$testa);
+               $testb = $test1[1];
+               
+               if (eregi("newbookmark",$testb) == TRUE) { }
+               elseif (eregi("@",$testb) == TRUE) {
+                       $cible = "$addresse_fcp" . "$testb";
+                       echo "externe : $cible<br>";
                }
+               else { 
+                       $cible = "$addresse_complete" . "$testb";
+                       echo "interne : $cible<br>"; 
+               }
+               //exit();
        }
+    break;
 }
+
+fclose($ouvre);
+*/
+
 ?>
\ No newline at end of file

Modified: trunk/apps/DarknetSpiderBot/config.php
===================================================================
--- trunk/apps/DarknetSpiderBot/config.php      2006-04-11 16:53:12 UTC (rev 
8514)
+++ trunk/apps/DarknetSpiderBot/config.php      2006-04-11 17:01:08 UTC (rev 
8515)
@@ -1,13 +1,13 @@
 <?php
-       $start_page = "/SSK@PFeLTa1si2Ml5sDeUy7eDhPso6TPdmw-2gWfQ4Jg02w,3ocfrqgUMVWA2PeorZx40TW0c-FiIOL-TWKQHoDbVdE,AQABAAE/Index-21/";
-       $start_file = "all.html";
-       
-       $timeout = "20"; 
-       $addresse_fcp = "http://127.0.0.1:8888";;
-       
-       $hostname_bot = "10.0.0.1";
-       $database_bot = "freenet";
-       $username_bot = "sich";
-       $password_bot = "19Geneve54";
-       $bot = mysql_pconnect($hostname_bot, $username_bot, $password_bot) or 
trigger_error(mysql_error(),E_USER_ERROR); 
-?>
+
+$timeout = "20"; 
+$fcp = "http://127.0.0.1:8888";;
+
+$wget_dir = 'D:\\Darknet\\bin\\';
+
+$hostname_bot = "";
+$database_bot = "";
+$username_bot = "";
+$password_bot = "";
+//$bot = mysql_pconnect($hostname_bot, $username_bot, $password_bot) or 
trigger_error(mysql_error(),E_USER_ERROR); 
+?>
\ No newline at end of file


Reply via email to