October 2011

Competition announcements will be made in here.

October 2011

Postby jacek » Tue Oct 11, 2011 8:11 pm

This competition has now ended

All of the entries that people are happy for me to publish are shown in the posts below with a short video about each one.

The winning entry was submitted by Dylan and is in the first post below this one.

Task:
- To create a simple web crawler to gather information from a website.
- The information does not have to be stored.
- Only look for things that would be useful for a search engine; no marks will be given for pointless info.

Rules:
- All code must be written by you.
- You can use any language.
- No cheating.

Prize:
- Any game or Envato marketplace item.
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:54 pm

Entry #14



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  // Two-level crawler: for every seed URL collect its meta tags and its links,
  // then for every link collect that page's meta tags and outgoing links.
  // The result is printed as a nested array.

  // error_reporting() expects an integer level constant, not the string 'E_WARNING'.
  error_reporting(E_WARNING);
  set_time_limit(6000);

  // Define URLs as an array
  $seedUrls = array(
      "http://www.betterphp.co.uk/",
      "http://www.phpacademy.org"
  );

  // Meta entries that carry nothing useful for a search engine.
  $uselessTags = array('http-equiv', 'google-site-verification');

  $results = array();

  // The loop variable has its own name so the seed list is not overwritten
  // while it is being iterated.
  foreach ($seedUrls as $url) {
      $tags = get_meta_tags($url);  // Get META tags
      if ($tags === false) {
          $tags = array();
      }
      foreach ($uselessTags as $uselessTag) {
          unset($tags[$uselessTag]);
      }

      // Start every seed with an empty list so a page without links (or a
      // failed download) still produces a well-formed entry.
      $links   = array();
      $matches = array(1 => array(), 2 => array());

      $content = file_get_contents($url);
      if ($content !== false) {
          preg_match_all("#<a.*href=\"(.*)\".*>(.*)</a>#U", $content, $matches);
      }

      foreach ($matches[1] as $x => $href) {
          if (preg_match('#^https?://#i', $href)) {
              // Already absolute (http or https): use as-is.
              $tmpUrl = $href;
          } else {
              // Relative link: join with exactly one slash.
              $tmpUrl = rtrim($url, '/') . '/' . ltrim($href, '/');
          }

          $links[$tmpUrl]['title'] = htmlentities($matches[2][$x]);

          // Links found on the linked page, kept under index 1 as before.
          $subUrls    = array(1 => array());
          $subcontent = file_get_contents($tmpUrl);
          if ($subcontent !== false) {
              preg_match_all("#<a.+href=\"(.+)\".+>(.+)</a>#U", $subcontent, $submatches);
              $subUrls[1] = $submatches[1];
          }

          $subTags = get_meta_tags($tmpUrl);
          if ($subTags === false) {
              $subTags = array();
          }
          foreach ($uselessTags as $uselessTag) {
              unset($subTags[$uselessTag]);
          }

          $links[$tmpUrl]['tags']    = $subTags;
          $links[$tmpUrl]['subUrls'] = $subUrls;
      }

      $results[$url] = array(
          "tags"  => $tags,
          "links" => $links
      );
  }

  echo "<pre>";
  print_r($results);
  echo "</pre>";
  51. ?>
Parsed in 0.056 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:56 pm

Entry #13



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2.  
  3.        
  4.  
  // Prints the meta information of one page, followed by the absolute URL and
  // the text of every link found on it.

  // Serve the report as UTF-8 so non-ASCII characters display the right way.
  header('Content-Type: text/html; charset=utf-8');

  $url = "http://betterphp.co.uk/home.html";

  // Download the page once and reuse the markup for the title fallback and
  // for the link extraction.
  $html = file_get_contents($url);
  if ($html === false) {
      $html = '';
  }

  $meta = get_meta_tags($url);
  if ($meta === false) {
      $meta = array();
  }

  // No <meta name="title">: fall back to the <title> element when there is one.
  if (empty($meta['title']) && preg_match("/<title>(.+)<\/title>/i", $html, $title)) {
      $meta['title'] = $title[1];
  }
  echo '<pre>', print_r($meta, true), '</pre>';

  // $results[1] holds the quoted href values, $results[3] the link texts.
  preg_match_all('/<a(?:.*?)href=(["|\'].*?["|\'])(.*?)>(.*?)\<\/a\>/i', $html, $results);

  // Pieces of the page URL needed to resolve relative links.
  $parts     = parse_url($url);
  $root      = $parts['scheme'] . '://' . $parts['host'];
  $path      = isset($parts['path']) ? $parts['path'] : '/';
  $directory = substr($path, 0, strrpos($path, '/') + 1);

  // Index-based loops instead of by-reference foreach, so no reference to the
  // last element is left dangling afterwards.
  foreach ($results[1] as $index => $href) {
      // remove those pesky quotes around the url (single or double)
      $href = trim($href, "\"'");

      if (preg_match('#^[a-z][a-z0-9+.-]*:#i', $href)) {
          // Has its own scheme (http, https, mailto, ...): already absolute.
          $absolute = $href;
      } elseif (substr($href, 0, 2) === '//') {
          // Protocol-relative link: borrow the scheme of the page.
          $absolute = $parts['scheme'] . ':' . $href;
      } elseif (substr($href, 0, 1) === '/') {
          // Path relative to the site root.
          $absolute = $root . $href;
      } elseif ($href === '' || $href[0] === '#' || $href[0] === '?') {
          // Fragment or query on the current page.
          $absolute = $root . $path . $href;
      } else {
          // Path relative to the directory of the current page.
          $absolute = $root . $directory . $href;
      }

      $results[1][$index] = $absolute;
  }

  // Link texts that contain an image are shown as escaped markup instead of
  // rendering the image (or a broken-image box).
  foreach ($results[3] as $index => $text) {
      if (stristr($text, "<img")) {
          $results[3][$index] = htmlentities($text);
      }
  }

  echo "<pre>" .print_r($results[1], true). "</pre>";
  echo "<pre>" .print_r($results[3], true). "</pre>";
  102.  
  103.        
  104.  
  105. ?>
Parsed in 0.046 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:56 pm

Entry #12
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:56 pm

Entry #11



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2. /*
  3. DISCLAIMERS:
  4. I did all this on my iPod. This explains any odd typing.
  5.  
  6. This code is only meant to work on this specific website.
  7.  
  8. Error reporting was turned OFF because it was displaying errors that I didn't understand and didn't appear to be harming me at all.
  9.  
  10. This code is meant to obtain the page URL, TITLE, DESCRIPTION, and KEYWORDS. I was unable to remove the google code thing.
  11. */
  error_reporting(0);

  // Collect XML parse problems internally; failure is detected through the
  // return value below.
  libxml_use_internal_errors(true);

  $markup = file_get_contents("http://betterphp.co.uk/home.html");

  // simplexml_load_string() returns false when the markup is not well-formed XML.
  $data = ($markup === false) ? false : simplexml_load_string($markup);

  if ($data === false) {
      echo "<strong>Error</strong>: the page could not be downloaded or is not well-formed XML.<br />";
  } else {
      echo "<strong>Title</strong>: ", $data->head->title, "<br />";
      foreach ($data->head->meta as $meta) {
          $name = (string) $meta['name'];
          // Skip entries without a name (e.g. http-equiv) and the google
          // verification code, by name rather than by position.
          if ($name === '' || strtolower($name) === 'google-site-verification') {
              continue;
          }
          echo "<strong>", ucwords($name), "</strong>: ", $meta['content'], "<br />";
      }
  }

  // NOTE(review): this prints the address of this script, not of the crawled
  // page - confirm which one is intended.
  echo "<strong>URL:</strong> {$_SERVER['SERVER_NAME']}{$_SERVER['PHP_SELF']}";
  23. ?>
Parsed in 0.041 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:56 pm

Entry #10



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2.  
  3.  
  4.  
  5. /* Web crawler - BetterPHP competition */
  6.  
  7.  
  8.  
  9. /**
  10.  
  11.  * Copyright 2011 Conrad Kleinespel - http://conradk.com
  12.  
  13.  *
  14.  
  15.  * This program is free software: you can redistribute it and/or modify
  16.  
  17.  * it under the terms of the GNU General Public License as published by
  18.  
  19.  * the Free Software Foundation, either version 3 of the License, or
  20.  
  21.  * (at your option) any later version.
  22.  
  23.  *
  24.  
  25.  * This program is distributed in the hope that it will be useful,
  26.  
  27.  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  28.  
  29.  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  30.  
  31.  * GNU General Public License for more details.
  32.  
  33.  *
  34.  
  35.  * You should have received a copy of the GNU General Public License
  36.  
  37.  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  38.  
  39.  *
  40.  
  41.  **/
  42.  
  43.  
  44.  
  45. /**
  46.  
  47.  * INFO THE SCRIPT GATHERS
  48.  
  49.  * -----------------------
  50.  
  51.  * 1. X Title of the page
  52.  
  53.  * 2. X Meta tags content
  54.  
  55.  * 3. X Language of the page if provided
  56.  
  57.  * 4. X Links
  58.  
  59.  * 5. X Keywords and number of times they are repeated in the page (-> weight ?)
  60.  
  61.  * 6. X Links to image files
  62.  
  63.  *
  64.  
  65.  */
  66.  
  67.  
  68.  
  69. // Show the results as plain text
  70.  
  71. header('Content-type: text/plain; charset=utf-8;');
  72.  
  73.  
  74.  
  75.  
  76.  
  77.  
  78.  
  79. /**
  80.  
  81.  * -------------------------------- PARAMS START --------------------------------
  82.  
  83.  **/
  84.  
  85.  
  86.  
  // URL of the page to crawl, taken from the query string.
  // Stays null when the parameter is missing; the validation further down
  // reports that case.
  $url = isset($_GET['url']) ? $_GET['url'] : null;

  // Do you want to crawl pages that $url links to ?
  // Any non-empty ?crawl_all=... value switches it on; default is off.
  $ca = isset($_GET['crawl_all']) ? $_GET['crawl_all'] : null;
  $crawl_all = !empty($ca);
  98.  
  99.  
  100.  
  101.  
  102.  
  103. /**
  104.  
  105.  * -------------------------------- PARAMS STOP --------------------------------
  106.  
  107.  **/
  108.  
  109.  
  110.  
  111. /**
  112.  
  113.  * -------------------------------- ARRAYS START --------------------------------
  114.  
  115.  **/
  116.  
  117.  
  118.  
  // Shared crawl state, all starting out empty:
  //   $page_urls     - all URLs to crawl
  //   $page_contents - the content of the pages that have been crawled
  //   $info          - all the information gathered about the pages
  $page_urls = $page_contents = $info = array();
  134.  
  135.  
  136.  
  137. /**
  138.  
  139.  * -------------------------------- ARRAYS STOP --------------------------------
  140.  
  141.  **/
  142.  
  143.  
  144.  
  145. /**
  146.  
  147.  * -------------------------------- ERROR HANDLING START --------------------------------
  148.  
  149.  **/
  150.  
  151.  
  152.  
  // Every problem found with the request is collected here.
  $errors = array();

  // A missing or empty URL is an error ...
  if (empty($url)) {
      $errors[] = 'No URL to crawl was specified.';
  }

  // ... and so is anything that does not validate as a URL.
  if (!filter_var($url, FILTER_VALIDATE_URL)) {
      $errors[] = 'The URL you have specified is not valid.';
  }

  // With at least one error: print the count, then one line per error, and stop.
  if ($errors) {
      $error_total = count($errors);
      echo $error_total;
      if ($error_total > 1) {
          echo " errors have occured.\n";
      } else {
          echo " error has occured.\n";
      }
      foreach ($errors as $position => $message) {
          echo $position, ' - ', $message, "\n";
      }
      exit();
  }
  192.  
  193.  
  194.  
  195. /**
  196.  
  197.  * -------------------------------- ERROR HANDLING END --------------------------------
  198.  
  199.  **/
  200.  
  201.  
  202.  
  203. /**
  204.  
  205.  * -------------------------------- FUNCTIONS START --------------------------------
  206.  
  207.  **/
  208.  
  209.  
  210.  
  /**
   * Download a page with cURL.
   *
   * Follows redirects and gives up connecting after 5 seconds.
   *
   * @param string $url Address of the page to download.
   * @return mixed Whatever curl_exec() returns for the request.
   */
  function get_page_content($url) {
      $handle = curl_init();
      curl_setopt_array($handle, array(
          CURLOPT_URL            => $url,
          CURLOPT_RETURNTRANSFER => TRUE,
          CURLOPT_CONNECTTIMEOUT => 5,
          CURLOPT_FOLLOWLOCATION => TRUE,
      ));
      $response = curl_exec($handle);
      curl_close($handle);

      return $response;
  }

  // Page 0 is the URL passed to the script; fetch it straight away.
  $page_urls[0] = $url;
  $page_contents[0] = get_page_content($url);
  240.  
  241.  
  242.  
  243. // Functions used to get different kinds of information
  244.  
  /**
   * Store the <title> text of a crawled page in $info[$page_num]['title'].
   *
   * Reads the markup from the global $page_contents and writes to the global
   * $info. An empty string is stored when the page has no <title> element or
   * its content is missing.
   *
   * @param int $page_num Index of the page in $page_contents.
   * @return int Always 0.
   */
  function get_page_title($page_num) {
      global $info, $page_contents;

      $html  = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';
      $title = '';

      // Only read the capture group when the pattern actually matched.
      if (preg_match('#<title>(.*)</title>#imsU', $html, $matches)) {
          $title = trim($matches[1]);
      }

      $info[$page_num]['title'] = $title;
      return 0;
  }
  256.  
  /**
   * Store the xml:lang attribute of the <html> tag in
   * $info[$page_num]['language']['xml'].
   *
   * Reads the markup from the global $page_contents and writes to the global
   * $info. An empty string is stored when the <html> tag has no xml:lang.
   *
   * @param int $page_num Index of the page in $page_contents.
   * @return int Always 0.
   */
  function get_xml_language($page_num) {
      global $info, $page_contents;

      $html     = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';
      $language = '';

      // [^>]* keeps the search inside the <html ...> tag itself, and the
      // back-reference makes the closing quote match the opening one.
      if (preg_match('#<html\b[^>]*?\sxml:lang\s*=\s*(["\'])(.*?)\1#is', $html, $matches)) {
          $language = trim($matches[2]);
      }

      $info[$page_num]['language']['xml'] = $language;
      return 0;
  }
  268.  
  /**
   * Store the lang attribute of the <html> tag in
   * $info[$page_num]['language']['page'].
   *
   * Reads the markup from the global $page_contents and writes to the global
   * $info. An empty string is stored when the <html> tag has no lang attribute.
   *
   * @param int $page_num Index of the page in $page_contents.
   * @return int Always 0.
   */
  function get_page_language($page_num) {
      global $info, $page_contents;

      $html     = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';
      $language = '';

      // The whitespace required before "lang" stops the pattern from matching
      // the tail of "xml:lang"; [^>]* keeps the search inside the <html> tag.
      if (preg_match('#<html\b[^>]*?\slang\s*=\s*(["\'])(.*?)\1#is', $html, $matches)) {
          $language = trim($matches[2]);
      }

      $info[$page_num]['language']['page'] = $language;
      return 0;
  }
  280.  
  /**
   * Work out the base URL that relative links on a page resolve against.
   *
   * Uses the href of a <base> tag when the page has one and it is a valid URL;
   * otherwise falls back to scheme://host of the page's own URL.
   *
   * @param int $page_num Index of the page in $page_contents / $page_urls.
   * @return string The base URL, or '' when neither source provides one.
   */
  function get_base($page_num) {
      // Both arrays live in the global scope; without this declaration the
      // function only sees empty locals.
      global $page_contents, $page_urls;

      $html = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';

      // If a base URL is specified via HTML, use it as a base
      if (preg_match('~<base\b[^>]*?\shref\s*=\s*(["\'])([^#"\'][^"\']*)\1~is', $html, $matches)
          && filter_var($matches[2], FILTER_VALIDATE_URL)) {
          return $matches[2];
      }

      // Else, use the domain name as base
      $parsed_url = isset($page_urls[$page_num]) ? parse_url($page_urls[$page_num]) : false;
      if (!is_array($parsed_url) || !isset($parsed_url['scheme'], $parsed_url['host'])) {
          return '';
      }

      // parse_url() reports the host name under the key 'host'.
      return $parsed_url['scheme'] . '://' . $parsed_url['host'];
  }
  304.  
  /**
   * Turn a link found on a page into an absolute URL.
   *
   * - Links that already validate as a URL, or that carry their own scheme
   *   (mailto:, javascript:, ...), are returned trimmed but otherwise unchanged.
   * - Protocol-relative links (//host/path) get "http:" prepended.
   * - Links starting with "/" are resolved against the site root of $page_url.
   * - Everything else is resolved against the directory of $page_url.
   *
   * "." and ".." segments are not collapsed.
   *
   * @param string $url      The link as written in the page.
   * @param string $page_url Absolute URL of the page the link was found on.
   * @return string The absolute URL, or the trimmed link when $page_url has no
   *                scheme/host to resolve against.
   */
  function format_abs_url($url, $page_url) {
      $formatted_url = trim($url);

      if (substr($formatted_url, 0, 2) == '//') {
          $formatted_url = 'http:' . $formatted_url;
      }

      // Already absolute, or a non-hierarchical scheme that must not be joined
      // onto the page's path.
      if (filter_var($formatted_url, FILTER_VALIDATE_URL)
          || preg_match('~^[a-z][a-z0-9+.-]*:~i', $formatted_url)) {
          return $formatted_url;
      }

      // Parses the URL of the current page $page_url
      $parsed = parse_url($page_url);
      if (!is_array($parsed) || !isset($parsed['scheme'], $parsed['host'])) {
          return $formatted_url;
      }

      // Keep a non-default port so the result points at the same server.
      $origin = $parsed['scheme'] . '://' . $parsed['host'];
      if (isset($parsed['port'])) {
          $origin .= ':' . $parsed['port'];
      }

      // Absolute path: resolve against the site root.
      if (substr($formatted_url, 0, 1) == '/') {
          return $origin . '/' . ltrim($formatted_url, '/');
      }

      // Relative path: resolve against the page's directory, i.e. everything
      // up to and including the last slash of its path. dirname() is not used
      // because it returns "/" for "/" (giving "//") and drops the last
      // directory of paths that end in a slash.
      $path       = isset($parsed['path']) ? $parsed['path'] : '/';
      $last_slash = strrpos($path, '/');
      $directory  = ($last_slash === false) ? '/' : substr($path, 0, $last_slash + 1);

      return $origin . $directory . $formatted_url;
  }
  350.  
  /**
   * Collect the absolute URLs of all links on a page into
   * $info[$page_num]['links']['url'].
   *
   * Links starting with "#" or "?" are ignored by the pattern. The stored list
   * is de-duplicated and re-indexed from 0; it is an empty array when the page
   * has no links. URLs already stored for the page are kept.
   *
   * @param int $page_num Index of the page in $page_contents / $page_urls.
   * @return int Always 0.
   */
  function get_links($page_num) {
      global $info, $page_contents, $page_urls;

      $html     = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';
      $page_url = isset($page_urls[$page_num]) ? $page_urls[$page_num] : '';

      // Start from what is already stored so a repeated call does not lose
      // anything, and so the list exists even when nothing matches.
      $urls = array();
      if (isset($info[$page_num]['links']['url']) && is_array($info[$page_num]['links']['url'])) {
          $urls = $info[$page_num]['links']['url'];
      }

      // Basic URL filtering
      preg_match_all('#<a(.*)href=("|\')([^\#\?].*)("|\')(.*)>(.*)</a>#imsU', $html, $matches);
      foreach ($matches[3] as $match) {
          // Get the URL for each link
          $urls[] = format_abs_url($match, $page_url);
      }

      // array_values() re-indexes after de-duplication without a string
      // round-trip (which turned an empty list into array('')).
      $info[$page_num]['links']['url'] = array_values(array_unique($urls));

      return 0;
  }
  380.  
  /**
   * Collect the absolute URLs of all images on a page into
   * $info[$page_num]['images']['url'].
   *
   * Sources starting with "#" or "?" are ignored by the pattern. The stored
   * list is de-duplicated and re-indexed from 0; it is an empty array when the
   * page has no images. URLs already stored for the page are kept.
   *
   * @param int $page_num Index of the page in $page_contents / $page_urls.
   * @return int Always 0.
   */
  function get_images($page_num) {
      global $info, $page_contents, $page_urls;

      $html     = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';
      $page_url = isset($page_urls[$page_num]) ? $page_urls[$page_num] : '';

      // Start from what is already stored so the list always exists, even
      // when nothing matches.
      $urls = array();
      if (isset($info[$page_num]['images']['url']) && is_array($info[$page_num]['images']['url'])) {
          $urls = $info[$page_num]['images']['url'];
      }

      // Basic img URL filtering
      preg_match_all('#<img(.*)src=("|\')([^\#\?].*)("|\')(.*)>#imsU', $html, $matches);
      foreach ($matches[3] as $match) {
          // Get the URL for each image
          $urls[] = format_abs_url($match, $page_url);
      }

      // array_values() re-indexes after de-duplication without a string
      // round-trip (which turned an empty list into array('')).
      $info[$page_num]['images']['url'] = array_values(array_unique($urls));

      return 0;
  }
  410.  
  /**
   * Store the meta tags of a page in $info[$page_num]['meta'].
   *
   * The tags are read with get_meta_tags() from the page's URL (a separate
   * request, not the content already held in $page_contents). An empty array
   * is stored when the tags cannot be read.
   *
   * @param int $page_num Index of the page in $page_urls.
   * @return int Always 0.
   */
  function get_meta($page_num) {
      global $info, $page_urls;

      $tags = isset($page_urls[$page_num]) ? get_meta_tags($page_urls[$page_num]) : false;

      // get_meta_tags() returns false on failure; keep the stored value an array.
      $info[$page_num]['meta'] = ($tags === false) ? array() : $tags;

      return 0;
  }
  420.  
  /**
   * Comparison callback for ascending sorts.
   *
   * @param mixed $a Left value.
   * @param mixed $b Right value.
   * @return int 0 when the values compare equal, -1 when $a is smaller, 1 otherwise.
   */
  function cmp($a, $b) {
      if ($a == $b) {
          $order = 0;
      } elseif ($a < $b) {
          $order = -1;
      } else {
          $order = 1;
      }

      return $order;
  }
  432.  
  /**
   * Count how often each word occurs in a page and store the result in
   * $info[$page_num]['kw'].
   *
   * The result has two parallel parts that share their keys:
   *   'words' - list of distinct words in order of first appearance
   *   'count' - occurrences per word, sorted ascending with keys preserved,
   *             so $kw['words'][$k] is the word counted in $kw['count'][$k]
   *
   * Words are whatever str_word_count() extracts from the raw page content,
   * so markup names are counted as well; comparison is case-sensitive.
   *
   * @param int $page_num Index of the page in $page_contents.
   * @return void
   */
  function get_imp_kw($page_num) {
      global $info, $page_contents;

      $content = isset($page_contents[$page_num]) ? (string) $page_contents[$page_num] : '';

      // Here, you could check the words against a list of words not to use as
      // keywords: 'the', 'a', HTML tags, etc.

      // One pass over the words; keys come out in first-appearance order.
      $frequencies = array_count_values(str_word_count($content, 1));

      $counts = array_values($frequencies);
      // Sort once, after counting, and keep the keys so each count stays
      // linked to its word.
      asort($counts);

      $info[$page_num]['kw'] = array(
          'words' => array_keys($frequencies),
          'count' => $counts,
      );
  }
  472.  
  /**
   * Run every information collector on one page, in a fixed order:
   * title, meta tags, both language attributes, links, images, keywords.
   *
   * @param int $page_num Index of the page to inspect.
   * @return void
   */
  function get_all_info($page_num) {
      $collectors = array(
          'get_page_title',
          'get_meta',
          'get_xml_language',
          'get_page_language',
          'get_links',
          'get_images',
          'get_imp_kw',
      );

      foreach ($collectors as $collector) {
          $collector($page_num);
      }
  }
  490.  
  491.  
  492.  
  493. /**
  494.  
  495.  * -------------------------------- FUNCTIONS END --------------------------------
  496.  
  497.  **/
  498.  
  499.  
  500.  
  501. /**
  502.  
  503.  * -------------------------------- GET INFO START --------------------------------
  504.  
  505.  **/
  506.  
  507.  
  508.  
  // Single-page mode: gather everything for page 0, print it and stop.
  if ($crawl_all == 0) {
      get_all_info(0);
      print_r($info);
      exit;
  }

  // Crawl-all mode: only the links of page 0 are needed at this point.
  if ($crawl_all == 1) {
      get_links(0);
  }

  // Queue every link found on the first page behind the start URL.
  foreach ($info[0]['links']['url'] as $discovered_url) {
      array_push($page_urls, $discovered_url);
  }

  // Rebuild $page_contents from scratch so it lines up with the new URL list:
  // one download per queued URL, in queue order.
  $page_contents = array_values(array_map('get_page_content', $page_urls));

  // Finally, gather all information for every downloaded page.
  foreach (array_keys($page_contents) as $page_index) {
      get_all_info($page_index);
  }

  // Print the crawled URLs, followed by the information for each of them.
  print_r($page_urls);
  print_r($info);
  576.  
  577. /**
  578.  
  579.  * -------------------------------- GET INFO END --------------------------------
  580.  
  581.  **/
Parsed in 0.076 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:56 pm

Entry #9
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:57 pm

Entry #8



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  <!DOCTYPE html>
  <html>
  <head>
  <title>PHP Web Crawler Recon</title>
  <!-- Style rules belong in the head, not inside the body. -->
  <style type="text/css">
  .results_box {
          background-color:#FFFEE0;
          overflow: auto;
  }

  .headers {
          font-size:22px;
  }

  .results_format{
          border-style:ridge;
          border-width:5px;
  }

  body {
          background-color:#F9F4E4;
  }
  </style>
  </head>
  <body>

  <form action="crawl.php" name="crawler" method="POST">
  URL to crawl: <input type="text" name="url" value="http://www.google.com">
  <input type="submit" name="crawlSubmit" value="Crawl!">
  </form>
  58.  
  59.  
  60.  
  61. <?php
  62.  
  63. //Simple PHP reconnaissance tool, written for betterphp.co.uk competition ;) Hope I do well Jacek .{-_-}.
  64.  
  65. //PLEASE Make sure you run this on your 'local' web server for all functionality to work
  66.  
  67.  
  68.  
  69. class Recon {
  70.  
  71.        
  72.  
  73.         public $urlGiven; //User defined
  74.  
  75.         var $ipAddressGiven; //Resolved by gethostbyname
  76.  
  77.        
  78.  
  79.         /*********************************
  80.  
  81.         *     Url Validation & Errors    *
  82.  
  83.         *     Validates URL and gives    *
  84.  
  85.         *     two kinds of errors warn   *
  86.  
  87.         *     and die, a.k.a fatal.      *
  88.  
  89.         *                                *
  90.  
  91.         *********************************/
  92.  
  93.        
  94.  
  /**
   * Stop the script unless $url looks like an http(s) URL.
   *
   * Accepted: http:// or https://, a host made of letters, digits and
   * hyphens separated by dots, an optional :port and an optional path.
   *
   * @param string $url           The URL entered in the form.
   * @param mixed  $formSubmitted Not used by the validation.
   * @return void Calls errorDie() when the URL is rejected.
   */
  public function validateForm($url, $formSubmitted)
  {
      // The dot between host labels is escaped so it only matches a literal
      // dot, and the path is optional so "http://www.google.com" is accepted.
      $regex = '@^https?://[a-z0-9-]+(\.[a-z0-9-]+)*(:[0-9]+)?(/.*)?$@i';

      // preg_match() returns 1 on a match: anything else is a bad URL.
      if (preg_match($regex, $url) !== 1) {
          $this->errorDie("Please enter a valid URL: Format accepted 'http://www.*.com'");
      }
  }
  118.  
  119.        
  120.  
  /**
   * Print a warning followed by a line break; execution continues.
   *
   * @param string $error_string Text of the warning.
   * @return void
   */
  public function errorWarn($error_string)
  {
      echo $error_string, "<br>";
  }
  128.  
  129.        
  130.  
  /**
   * Fatal error (such as a bad URL): print the message and end the script.
   *
   * @param string $die_string Text of the message.
   * @return void
   */
  public function errorDie($die_string)
  {
      exit($die_string);
  }
  138.  
  139.        
  140.  
  141.        
  142.  
  143.         /*********************************
  144.  
  145.         *           grabLinks()          *
  146.  
  147.         *   Grabs links that are pre-    *
  148.  
  149.         *   sent in URL homepage.        *
  150.  
  151.         *                                *
  152.  
  153.         *                                *
  154.  
  155.         *********************************/
  156.  
  157.        
  158.  
  /**
   * Print every href value found on the page at $this->urlGiven.
   *
   * Output is a results box with one HTML-escaped value per line. Nothing is
   * printed when no URL is set; a warning is printed when the page cannot be
   * downloaded.
   *
   * @return void
   */
  public function grabLinks()
  {
      $urlGiven = $this->urlGiven;

      if (empty($urlGiven)) {
          return;
      }

      // Warning suppressed because the failure is reported just below.
      $page = @file_get_contents($urlGiven);
      if ($page === false) {
          $this->errorWarn("Could not download the page to read its links.");
          return;
      }

      // Capture the quoted value of every href attribute directly, instead of
      // matching the attribute and splitting the quotes off afterwards (the
      // split also cut URLs at the first "|").
      $regex = '@href[ ]*=[ ]*(["\'])([^"\']*)\1@';
      preg_match_all($regex, $page, $linksRetrievedArr);

      echo "<div style='height:250; width:500;' class='results_box'>";
      foreach ($linksRetrievedArr[2] as $href) {
          echo htmlentities($href) . "<br>";
      }
      echo "</div>";
  }
  232.  
  233.        
  234.  
  235.        
  236.  
  237.         /*********************************
  238.  
  239.         *              grabIP()          *
  240.  
  241.         *       Gets host IP address     *
  242.  
  243.         *       achieved via Reverse-DNS *
  244.  
  245.         *       gethostbyname()          *
  246.  
  247.         *                                *
  248.  
  249.         *********************************/
  250.  
  251.                
  252.  
  /**
   * Resolve the host of $this->urlGiven to an IP address with
   * gethostbyname(), store it in $this->ipAddressGiven and print it.
   *
   * @return void
   */
  public function grabIP()
  {
      // parse_url() yields the bare host for http and https URLs alike,
      // without port or path.
      $host = parse_url($this->urlGiven, PHP_URL_HOST);

      if (empty($host)) {
          // No scheme in the input: take everything before the first "/" or ":".
          $host = preg_replace('@^(?:https?://)?([^/:]*).*$@is', '$1', (string) $this->urlGiven);
      }

      $serverip = gethostbyname($host);

      $this->ipAddressGiven = $serverip; //Storing to property

      echo "<div style='height:20; width:500;' class='results_box'>";
      echo "Server IP Address: " . htmlentities($serverip) . "<br>"; //Output IP resolved
      echo "</div>";
  }
  288.  
  289.        
  290.  
  291.        
  292.  
  293.         /*********************************
  294.  
  295.         *           grabServers()        *
  296.  
  297.         *       Grabs MX and NS servers  *
  298.  
  299.         *       associated with target   *
  300.  
  301.         *       URL address.             *
  302.  
  303.         *                                *
  304.  
  305.         *********************************/
  306.  
  307.        
  308.  
  /**
   * Print the MX and NS servers found in the DNS records of the domain of
   * $this->urlGiven: all MX entries first, then all NS entries.
   *
   * The domain is the URL's host with a leading "www." removed. An empty
   * results box is printed when the lookup fails.
   *
   * @return void
   */
  public function grabServers()
  {
      // Reduce the URL to its host so scheme, port and path do not end up in
      // the DNS query; fall back to the raw input when it has no scheme.
      $host = parse_url($this->urlGiven, PHP_URL_HOST);
      if (empty($host)) {
          $host = (string) $this->urlGiven;
      }
      $domain = preg_replace('@^www\.@i', '', $host);

      // Warning suppressed because a failed lookup is handled just below.
      $servers_found = @dns_get_record($domain); //Grab all DNS records
      if (!is_array($servers_found)) {
          $servers_found = array();
      }

      $labels = array(
          'MX' => 'MX Server: ',
          'NS' => 'NS: ',
      );

      echo "<div class='results_box' style='height:100; width:500;'>";
      foreach ($labels as $type => $label) {
          foreach ($servers_found as $record) {
              if (!isset($record['type'], $record['target']) || $record['type'] != $type) {
                  continue;
              }
              // Targets come from DNS, so escape them before printing.
              echo $label . "<i>" . htmlentities($record['target']) . "<br></i><br>";
          }
      }
      echo "</div>";
  }
  388.  
  389.        
  390.  
  391.        
  392.  
  393.         /*********************************
  394.  
  395.         *          grabHeaders()         *
  396.  
  397.         *   Grabs current HTTP Hea-      *
  398.  
  399.         *   der information, will        *
  400.  
  401.         *   try and force 403 or similar *
  402.  
  403.         *                                *
  404.  
  405.         *********************************/
  406.  
  407.        
  408.  
  409.         public function grabHeaders()
  410.  
  411.         {
  412.  
  413.                 $url = $this->urlGiven;
  414.  
  415.                 $ch = curl_init(); //Start cURL
  416.  
  417.                
  418.  
  419.                 curl_setopt($ch, CURLOPT_URL, $url); //Target
  420.  
  421.                 curl_setopt($ch, CURLOPT_HEADER, true);
  422.  
  423.                 curl_setopt($ch, CURLOPT_NOBODY, true);
  424.  
  425.                 curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); //Grab returned data
  426.  
  427.                 curl_setopt($ch, CURLOPT_TIMEOUT, 10); //Time out after 10 seconds
  428.  
  429.                
  430.  
  431.                 $headers = curl_exec($ch); //Execute and store result
  432.  
  433.                
  434.  
  435.                 echo "<div class='results_box' style='height:200; width:500;'>";
  436.  
  437.                 htmlentities(print_r($headers)); //Print headers returned safely
  438.  
  439.                 echo "</div>";
  440.  
  441.                
  442.  
  443.         }
  444.  
  445.        
  446.  
  447.        
  448.  
  449.         /*********************************
  450.  
  451.         *           grabSSH()            *
  452.  
  453.         *   Grabs ssh server version.    *
  454.  
  455.         *   This is grabbed from banner  *
  456.  
  457.         *   so do not trust 100%.        *
  458.  
  459.         *                                *
  460.  
  461.         *********************************/
  462.  
  463.        
  464.  
  465.         //THIS WILL NOT WORK ON A WEB SERVER UNLESS YOU HAVE DEDICATED ACCESS, OR HAVE THE POWER TO OPEN LOCAL PORTS.
  466.  
  467.         //THIS WILL WORK ON YOUR LOCAL SYSTEM, ENSURE YOU HAVE THE PORTS NOT BLOCKED.
  468.  
  469.        
  470.  
  471.         public function grabSSH($url, $port)
  472.  
  473.         {
  474.  
  475.                 $url = str_replace("http://", "", $url); //Clean for fsock
  476.  
  477.                 $sock = @fsockopen($url, $port, $errno, $errstr, 15); //Open connection to TARGET on PORT with timeout of 15 secs
  478.  
  479.                 if(!$sock)
  480.  
  481.                 {
  482.  
  483.                         $this->errorWarn("Connection to target failed, host could be blocking our attempts"); //Catch failed connection
  484.  
  485.                 }
  486.  
  487.                 else
  488.  
  489.                 {
  490.  
  491.                         echo "<div class='results_box' style='height:100; width:500;'>";
  492.  
  493.                         echo htmlentities(fgetss($sock, 150)); //Safely output returned data.
  494.  
  495.                         echo "</div>";
  496.  
  497.                         fclose($sock); //Close socket
  498.  
  499.                 }
  500.  
  501.                
  502.  
  503.         }
  504.  
  505.        
  506.  
  507.         /*********************************
  508.  
  509.         *            grabFTP()           *
  510.  
  511.         *   Grabs ftp server version.    *
  512.  
  513.         *   This is grabbed from banner  *
  514.  
  515.         *   so do not trust 100%.        *
  516.  
  517.         *                                *
  518.  
  519.         *********************************/
  520.  
  521.        
  522.  
  523.         //THIS WILL NOT WORK ON A WEB SERVER UNLESS YOU HAVE DEDICATED ACCESS, OR HAVE THE POWER TO OPEN LOCAL PORTS.
  524.  
  525.        
  526.  
  527.         public function grabFTP($url, $port)
  528.  
  529.         {
  530.  
  531.                 $url = str_replace("http://", "", $url); //Clean for fsock
  532.  
  533.                 $sock = @fsockopen($url, $port, $errno, $errstr, 15); //Open connection to TARGET on PORT with a timeout of 15 secs
  534.  
  535.                 if(!$sock)
  536.  
  537.                 {
  538.  
  539.                         $this->errorWarn("Connection to target failed, host could be blocking our attempts, or authentication needed!"); //Catch failed connection
  540.  
  541.                 }
  542.  
  543.                 else
  544.  
  545.                 {
  546.  
  547.                         echo "<div class='results_box' style='height:100; width:500;'>";
  548.  
  549.                         echo fgetss($sock, 150); //Safely output returned data.
  550.  
  551.                         echo "</div>";
  552.  
  553.                         fclose($sock); //Close socket
  554.  
  555.                 }
  556.  
  557.         }
  558.  
  559.        
  560.  
  561.        
  562.  
  563.         /*********************************
  564.  
  565.         *             whois()            *
  566.  
  567.         *   Grabs various domain info    *
  568.  
  569.         *   ONLY GODADDY compatible      *
  570.  
  571.         *   this is to change.           *
  572.  
  573.         *                                *
  574.  
  575.         *********************************/
  576.  
  577.        
  578.  
  579.         public function whois($query, $server)
  580.  
  581.         {
  582.  
  583.  
  584.  
  585.                 $query = preg_replace("@^http://www.@", "", $query);
  586.  
  587.  
  588.  
  589.             define('AE_WHOIS_TIMEOUT', 15); //Connection timeout
  590.  
  591.             global $ae_whois_errno, $ae_whois_errstr;
  592.  
  593.  
  594.  
  595.             //Connecting
  596.  
  597.             $f = fsockopen($server, 43, $ae_whois_errno, $ae_whois_errstr, AE_WHOIS_TIMEOUT);
  598.  
  599.             if (!$f)
  600.  
  601.                 return false; // connection failed
  602.  
  603.  
  604.  
  605.             //Sending query    
  606.  
  607.             fwrite($f, $query."\r\n");
  608.  
  609.  
  610.  
  611.             //Receving response
  612.  
  613.             $response = '';
  614.  
  615.             while (!feof($f))
  616.  
  617.                 $response .= fgets($f, 1024);
  618.  
  619.  
  620.  
  621.             //Closing connection
  622.  
  623.             fclose($f);
  624.  
  625.                 echo "<div class='results_box' style='height:200; word-wrap:break-word; width:500'>";
  626.  
  627.             echo $response;
  628.  
  629.                 echo "</div>";
  630.  
  631.                 return $response;
  632.  
  633.         }
  634.  
  635.        
  636.  
  637.         //End of Recon Class
  638.  
  639. }
  640.  
  641.  
  642.  
  643. /*******************
  644.  
  645. *                  *   
  646.  
  647. *      OUTPUT      *
  648.  
  649. *                  *
  650.  
  651. *******************/
  652.  
  653.  
  654.  
  655. //Check form subbed & URL not blank, if so continue.
  656.  
  657. $formSubmitted = (isset($_POST['crawlSubmit']))? 1 : 0;
  658.  
  659. $urlGiven = (isset($_POST['url']))? htmlentities($_POST['url']) : NULL;
  660.  
  661.  
  662.  
  663. if($formSubmitted == 1 && $urlGiven != NULL)
  664.  
  665. {
  666.  
  667. $crawler = new Recon();
  668.  
  669.  
  670.  
  671.  
  672.  
  673. /*******************
  674.  
  675. *   Validate URL   *   
  676.  
  677. *        &         *
  678.  
  679. *      Form        *
  680.  
  681. *******************/
  682.  
  683. $crawler->validateForm($urlGiven, $formSubmitted); //Checks for valid URL
  684.  
  685. $crawler->urlGiven = $urlGiven; //If valid then store to class property
  686.  
  687.  
  688.  
  689.  
  690.  
  691. echo "<div class='results_format'>";
  692.  
  693. /*******************
  694.  
  695. *    Grab Links    *   
  696.  
  697. *     (Crawl)      *  
  698.  
  699. *                  *
  700.  
  701. *******************/
  702.  
  703.  
  704.  
  705. echo "<div class='headers'> URL's Grabbed From Site </div> <br>";
  706.  
  707. $crawler->grabLinks(); //Grabs links
  708.  
  709. echo "<br>";
  710.  
  711.  
  712.  
  713. /*******************
  714.  
  715. *                  *   
  716.  
  717. *   Grab IP Addy   *
  718.  
  719. *                  *
  720.  
  721. *******************/
  722.  
  723.  
  724.  
  725. echo "<div class='headers'> IP Address </div> <br>";
  726.  
  727. $crawler->grabIP(); //Reverse-DNS to generate IP
  728.  
  729. echo "<br>";
  730.  
  731.  
  732.  
  733. /*******************
  734.  
  735. *                  *   
  736.  
  737. *   HTTP Headers   *
  738.  
  739. *                  *
  740.  
  741. *******************/
  742.  
  743.  
  744.  
  745. echo "<div class='headers'> Header Info </div> <br>";
  746.  
  747. $crawler->grabHeaders(); //Grabs header info (403 to be forced soon)
  748.  
  749. echo "<br>";
  750.  
  751.  
  752.  
  753. /*******************
  754.  
  755. *                  *   
  756.  
  757. *   MX & NS found  *
  758.  
  759. *                  *
  760.  
  761. *******************/
  762.  
  763.  
  764.  
  765. echo "<div class='headers'> Servers found! </div> <br>";
  766.  
  767. $crawler->grabServers(); //Find MX and NS server hostnames (IP TO BE INCLUDED SOON)
  768.  
  769. echo "<br>";
  770.  
  771.  
  772.  
  773. /*******************
  774.  
  775. *                  *   
  776.  
  777. *   Grab SSH Ver.  *
  778.  
  779. *                  *
  780.  
  781. *******************/
  782.  
  783.  
  784.  
  785. echo "<div class='headers'> SSH Banner Grabbed (Don't rely on this, Nmap is more accurate)</div>";
  786.  
  787. $crawler->grabSSH($urlGiven, '22');
  788.  
  789. echo "<br>";
  790.  
  791.  
  792.  
  793.  
  794.  
  795. /*******************
  796.  
  797. *                  *   
  798.  
  799. *   Grab FTP Ver.  *
  800.  
  801. *                  *
  802.  
  803. *******************/
  804.  
  805.  
  806.  
  807. echo "<div class='headers'> FTP Banner Grabbed (Don't rely on this, Nmap is more accurate)</div>";
  808.  
  809. $crawler->grabFTP($urlGiven, '21');
  810.  
  811. echo "<br>";
  812.  
  813.  
  814.  
  815. /*******************
  816.  
  817. *                  *   
  818.  
  819. *   Whois Results  *
  820.  
  821. *                  *
  822.  
  823. *******************/
  824.  
  825.  
  826.  
  827. echo "<div class='headers'> Whois Info </div>";
  828.  
  829.  
  830.  
  831. $whois_response = $crawler->whois($urlGiven, 'whois.godaddy.com'); //Searches for URL given in the GoDaddy whois records
  832.  
  833.  
  834.  
  835. if(strlen($whois_response) < 100)
  836.  
  837.         {
  838.  
  839.                 $crawler->errorWarn('There was an error contacting the whois database. Only GoDaddy domains available.');
  840.  
  841.         }
  842.  
  843.         //Catching the horrible whois server error, and making it neat.
  844.  
  845.  
  846.  
  847. echo "</div>";
  848.  
  849. }
  850.  
  851.  
  852.  
  853. //Have fun Jacek :)
  854.  
  855.  
  856.  
  857. ?>
  858.  
  859.  
  860.  
  861. </body>
  862.  
  863. </html>
Parsed in 0.083 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:57 pm

Entry #6



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2.  
  3.         function get_html($website) {
  4.  
  5.                 return file_get_contents($website);
  6.  
  7.         }
  8.  
  9.        
  10.  
  11.         function get_tags($website) {
  12.  
  13.                 $meta_tags = get_meta_tags($website);          
  14.  
  15.                 $keywords = $meta_tags['keywords'];
  16.  
  17.                 return $keywords;
  18.  
  19.         }
  20.  
  21.        
  22.  
  23.         function get_desc($website) {
  24.  
  25.                 $meta_tags = get_meta_tags($website);          
  26.  
  27.                 $desc = $meta_tags['description'];
  28.  
  29.                 return $desc;
  30.  
  31.         }
  32.  
  33.        
  34.  
  35.         function make_array($url) {
  36.  
  37.                 $tags = get_tags($url);
  38.  
  39.                 $desc = get_desc($url);        
  40.  
  41.                 $array['meta']['keywords'] = $tags;
  42.  
  43.                 $array['meta']['description'] = $desc; 
  44.  
  45.                 $reg = "<a\s[^>]*href\s*=\s*([\"\']??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";  
  46.  
  47.                 preg_match_all("/$reg/siU", get_html($url), $links, PREG_SET_ORDER);   
  48.  
  49.                 $i = 1;
  50.  
  51.                 foreach($links as $link) {
  52.  
  53.                         if(strstr($link[2], "http://") == false) {
  54.  
  55.                                 $link[2] = $url . $link[2];
  56.  
  57.                         }
  58.  
  59.                         $array['links'][$i]['name'] = $link[3];
  60.  
  61.                         $array['links'][$i]['url'] = $link[2];
  62.  
  63.                         $i++;
  64.  
  65.                 }      
  66.  
  67.                 return $array;
  68.  
  69.         }
  70.  
  71.         echo "<pre>", print_r(make_array("http://betterphp.co.uk/")), "</pre>";
  72.  
  73. ?>
Parsed in 0.044 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:58 pm

Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:58 pm

Entry #4



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2.  
  3.    //make the function
  4.    function crawler($url){
  5.        //The array(s)
  6.        $array   = array();
  7.        $links   = array();
  8.        $urlname = array();
  9.        $urls     = array();
  10.        
  11.        $meta        = get_meta_tags($url);
  12.        if(!$meta){
  13.            $meta = "Sorry!, no meta-tags are found!";
  14.        }
  15.        $parsedurl    = parse_url($url);
  16.        $h            = file_get_contents($url);
  17.        $dom        = new DOMDocument();
  18.        @$dom->loadHTML($h);
  19.        
  20.        $p            = new DOMXPath($dom);
  21.        $hrefs        = $p->evaluate("/html/body//a");
  22.        
  23.        for ($i = 0; $i < $hrefs->length; $i++){
  24.            $href            = $hrefs->item($i);
  25.            $u            = $href->getAttribute('href');
  26.            $n            = $href->getAttribute('title');
  27.            $urlname['name']    = $n;            
  28.            $urls['url']        = $u;
  29.            $links[]        = array_merge((array)$urlname, (array)$urls);
  30.        }
  31.        
  32.        $array['meta']            = $meta;
  33.        $array['info']            = $parsedurl;
  34.        $array['links']            = $links;
  35.        
  36.        echo "<pre>";
  37.            print_r($array);
  38.        echo "</pre>";
  39.        
  40.        
  41.    }
  42.    
  43.    //test the function
  44.    crawler("http://betterphp.co.uk");
  45. ?>
Parsed in 0.045 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:58 pm

Entry #2



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2.  
  3. error_reporting(E_ALL ^ E_WARNING);
  4.  
  5.  
  6.  
  7. if(isset($_GET['url'])) {
  8.  
  9.     $dom = new DOMDocument();
  10.  
  11.     if($dom->loadHTMLFile($_GET['url'])) {
  12.  
  13.         $html = array();
  14.  
  15.  
  16.  
  17.         $html['title'] = $dom->getElementsByTagName('title')->item(0)->nodeValue;
  18.  
  19.  
  20.  
  21.         $metas = $dom->getElementsByTagName('meta');
  22.  
  23.         foreach($metas AS $meta) {
  24.  
  25.             if(in_array($meta->getAttribute('name'), array('keywords', 'description'))) {
  26.  
  27.                 $html['meta'][$meta->getAttribute('name')] = $meta->getAttribute('content');
  28.  
  29.             }
  30.  
  31.         }
  32.  
  33.  
  34.  
  35.         $anchors = $dom->getElementsByTagName('a');
  36.  
  37.         foreach($anchors AS $anchor) {
  38.  
  39.             $html['links'][] = array(
  40.  
  41.                 'text' =>       $anchor->nodeValue,
  42.  
  43.                 'href' =>       $anchor->getAttribute('href'),
  44.  
  45.                 'title/alt' =>  ( ($anchor->getAttribute('title'))? $anchor->getAttribute('title') : $anchor->getAttribute('alt') )
  46.  
  47.             );
  48.  
  49.         }
  50.  
  51.         echo '<pre>' , htmlspecialchars(print_r($html,true)) , '</pre>';
  52.  
  53.     }
  54.  
  55.     else
  56.  
  57.         echo 'could not load file';
  58.  
  59. } else
  60.  
  61.     echo 'usage: ?url=htmlfiletocrawl';
  62.  
  63.  
  64.  
  65. ?>
Parsed in 0.046 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK

Re: October 2011

Postby jacek » Mon Oct 31, 2011 11:58 pm

Entry #1



Syntax: [ Download ] [ Hide ]
Using PHP Syntax Highlighting
  1. <?php
  2.  
  3.         $un_parsed_url = "http://betterphp.co.uk/"; // your url
  4.  
  5.        
  6.  
  7.         $htmlfile = new DOMDocument; // located in internal php code.
  8.  
  9.         $htmlfile->loadHTMLFile($un_parsed_url); // load the html code here (in this case a url).
  10.  
  11.        
  12.  
  13.         $domxpath = new DOMXPath($htmlfile); // located in internal php code.
  14.  
  15.         /*
  16.  
  17.         * //li is scaning for <li> tags
  18.  
  19.         * //a scans for <a> tags
  20.  
  21.         * [@href] after the a is the atribute of the <a> tag. (change href to title or something).
  22.  
  23.         */
  24.  
  25.         $tag = $domxpath->query('//li//a[@href]'); // search for all tags that outputed a href attribute.
  26.  
  27.        
  28.  
  29.         $finaloutput = array(); // make an empty array
  30.  
  31.         foreach($tag as $link){
  32.  
  33.                 $finalouput[] = array(
  34.  
  35.                         'name'  => $link->nodeValue,
  36.  
  37.                         'url'   => $un_parsed_url.$link->getAttribute('href')
  38.  
  39.                 ); // input url and title to empty array
  40.  
  41.         }
  42.  
  43.        
  44.  
  45.         $arr = array(
  46.  
  47.                 // get meta tags.
  48.  
  49.                 meta    => get_meta_tags($un_parsed_url),
  50.  
  51.                 // get links
  52.  
  53.                 links   => $finalouput,
  54.  
  55.         );
  56.  
  57.        
  58.  
  59.         // echo
  60.  
  61.         echo '<pre>';
  62.  
  63.                 print_r($arr);
  64.  
  65.         echo '</pre>';
  66.  
  67. ?>
Parsed in 0.043 seconds, using GeSHi 1.0.8.10
Image
User avatar
jacek
Site Admin
 
Posts: 3203
Joined: Thu May 05, 2011 1:45 pm
Location: UK


Return to Competitions



Who is online

Users browsing this forum: No registered users and 1 guest