October 2011
This competition has now ended
All of the entries that people are happy for me to publish are shown in the posts below with a short video about each one.
The winning entry was submitted by Dylan and is in the first post below this one.
Task:
- Create a simple web crawler to gather information from a website (a minimal sketch of the idea follows this announcement).
- The information does not have to be stored.
- Only look for things that would be useful to a search engine; no marks will be given for pointless information.
Rules:
- All code must be written by you.
- You can use any language.
- No cheating.
Prize:
- Any game or Envato marketplace item.
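For readers who want a starting point, here is a minimal sketch of the kind of crawler the task describes. It is not one of the entries; the target URL is just a placeholder, and it only gathers the search-engine-relevant basics (title, meta tags, links) from a single page using PHP's DOMDocument.

<?php
// Minimal single-page crawler sketch (not an entry). The URL is a placeholder.
$url = 'http://betterphp.co.uk/';

$doc = new DOMDocument();
@$doc->loadHTMLFile($url); // suppress warnings from imperfect real-world HTML

// Collect the things a search engine would actually care about.
$info = array(
    'url'  => $url,
    'meta' => get_meta_tags($url), // description, keywords, etc.
);

$titles = $doc->getElementsByTagName('title');
$info['title'] = $titles->length ? trim($titles->item(0)->nodeValue) : '';

foreach ($doc->getElementsByTagName('a') as $a) {
    $href = $a->getAttribute('href');
    if ($href !== '') {
        $info['links'][] = array(
            'text' => trim($a->nodeValue),
            'href' => $href,
        );
    }
}

echo '<pre>', htmlspecialchars(print_r($info, true)), '</pre>';
?>

Running it against a page prints a nested array of the gathered data, which is essentially what most of the entries below build up in more elaborate ways.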
Re: October 2011
Entry #14
<?php
error_reporting(E_WARNING); // the constant, not the string 'E_WARNING'
set_time_limit(6000);

// Define URLs as an array
$urls = array(
    "http://www.betterphp.co.uk/",
    "http://www.phpacademy.org"
);

foreach ($urls as $url) {
    $tags = get_meta_tags($url); // Get META tags
    unset($tags['http-equiv'], $tags['google-site-verification']); // Unset useless ones

    $content = file_get_contents($url);
    preg_match_all("#<a.*href=\"(.*)\".*>(.*)</a>#U", $content, $matches);
    unset($matches[0]);

    // Build the LINKS array
    $links = array();
    $count = count($matches[1]);
    for ($x = 0; $x < $count; $x++) {
        if (preg_match('#^https?://#', $matches[1][$x]) == 0) {
            $tmpUrl = $url . "/" . $matches[1][$x]; // relative link - prefix the page URL
        } else {
            $tmpUrl = $matches[1][$x]; // already an absolute http/https link
        }

        $links[$tmpUrl]['title'] = htmlentities($matches[2][$x]);

        $subcontent = file_get_contents($tmpUrl);
        preg_match_all("#<a.+href=\"(.+)\".+>(.+)</a>#U", $subcontent, $submatches[$x]);
        unset($submatches[$x][0], $submatches[$x][2]);

        $subTags = get_meta_tags($tmpUrl);
        unset($subTags['http-equiv'], $subTags['google-site-verification']);

        $links[$tmpUrl]['tags'] = $subTags;
        $links[$tmpUrl]['subUrls'] = $submatches[$x];
    }

    $results[$url] = array(
        "tags"  => $tags,
        "links" => $links
    );
    unset($links);
}

echo "<pre>";
print_r($results);
echo "</pre>";
?>
Re: October 2011
Entry #13
<?php
/* For displaying certain characters the right way - only for you, Mr. BetterPHP */
header('Content-Type: text/html; charset=utf-8');

$url = "http://betterphp.co.uk/home.html";
$meta = get_meta_tags($url);

if (empty($meta['title']) === TRUE) {
    $get = file_get_contents($url);
    preg_match("/<title>(.+)<\/title>/i", $get, $title);
    if (empty($title) === FALSE) {
        $meta['title'] = $title[1];
    }
    // else: no title set, too bad.
}
echo '<pre>', print_r($meta, true), '</pre>';

/* WORKING REGEX. JUST IN CASE I FEEL LIKE CHANGING ANYTHING AND MESS THINGS UP:
   '/<a(?:.*?)href=(["|\'].*?["|\'])(.*?)>(.*?)\<\/a\>/i' */
preg_match_all('/<a(?:.*?)href=(["|\'].*?["|\'])(.*?)>(.*?)\<\/a\>/i', file_get_contents($url), $results);

// Remove those pesky quotes around the URL
$results[1] = str_replace('"', '', $results[1]);

foreach ($results[1] as &$result) {
    if (stristr($result, "http://") === FALSE) {
        $result = $url . "/" . $result;
    }
}

/* I made this because I was testing it on a website that included a lot of images;
   instead of seeing the image or the "broken-image-box" I did it like this. */
foreach ($results[3] as &$result) {
    if (stristr($result, "<img")) {
        $result = htmlentities($result);
    }
}

echo "<pre>" . print_r($results[1], true) . "</pre>";
echo "<pre>" . print_r($results[3], true) . "</pre>";
?>
Re: October 2011
Entry #11
<?php
/* DISCLAIMERS:
   I did all this on my iPod, which explains any odd typing.
   This code is only meant to work on this specific website.
   Error reporting was turned OFF because it was displaying errors that I didn't
   understand and that didn't appear to be harming me at all.
   This code is meant to obtain the page URL, TITLE, DESCRIPTION, and KEYWORDS.
   I was unable to remove the Google code thing. */
error_reporting(0);

$url = file_get_contents("http://betterphp.co.uk/home.html");
$data = simplexml_load_string($url);
unset($data->head->meta[0]);

echo "<strong>Title</strong>: ", $data->head->title, "<br />";

foreach ($data->head->meta as $meta) {
    echo "<strong>", ucwords($meta['name']), "</strong>: ", $meta['content'], "<br />";
}

echo "<strong>URL:</strong> {$_SERVER['SERVER_NAME']}{$_SERVER['PHP_SELF']}";
?>
Re: October 2011
Entry #10
<?php
/* Web crawler - BetterPHP competition */
/**
 * Copyright 2011 Conrad Kleinespel - http://conradk.com
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 **/

/**
 * INFO THE SCRIPT GATHERS
 * -----------------------
 * 1. X Title of the page
 * 2. X Meta tags content
 * 3. X Language of the page if provided
 * 4. X Links
 * 5. X Keywords and number of times they are repeated in the page (-> weight ?)
 * 6. X Links to image files
 */

// Show the results as plain text
header('Content-type: text/plain; charset=utf-8;');

/* -------------------------------- PARAMS START -------------------------------- */

// URL of the page to crawl is set here
$url = isset($_GET['url']) ? $_GET['url'] : '';

// Do you want to crawl pages that $url links to?
$crawl_all = 0;
$ca = isset($_GET['crawl_all']) ? $_GET['crawl_all'] : '';
$crawl_all = !empty($ca) ? (bool) $ca : $crawl_all;

/* -------------------------------- PARAMS STOP -------------------------------- */

/* -------------------------------- ARRAYS START -------------------------------- */

// The $page_urls array contains all URLs to crawl
$page_urls = array();
// The $page_contents array contains the content of the pages that have been crawled
$page_contents = array();
// Array all the information about the pages is stored in
$info = array();

/* -------------------------------- ARRAYS STOP -------------------------------- */

/* -------------------------------- ERROR HANDLING START -------------------------------- */

// Array containing all errors that may have occurred
$errors = array();

// If no URL is specified, stop the script and print out an error message
if (!isset($url) || empty($url)) {
    $errors[] = 'No URL to crawl was specified.';
}
if (!filter_var($url, FILTER_VALIDATE_URL)) {
    $errors[] = 'The URL you have specified is not valid.';
}
if (!empty($errors)) {
    echo count($errors);
    echo (count($errors) > 1) ? " errors have occurred.\n" : " error has occurred.\n";
    foreach ($errors as $err_key => $error) {
        echo "{$err_key} - {$error}\n";
    }
    exit();
}

/* -------------------------------- ERROR HANDLING END -------------------------------- */

/* -------------------------------- FUNCTIONS START -------------------------------- */

// Get the content of a given page with cURL
function get_page_content($url) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}

// Get content of the URL passed to the script
$page_urls[0] = $url;
$page_contents[0] = get_page_content($page_urls[0]);

// Functions used to get different kinds of information
function get_page_title($page_num) {
    global $info, $page_contents;
    preg_match('#<title>(.*)</title>#imsU', $page_contents[$page_num], $matches);
    $info[$page_num]['title'] = trim($matches[1]);
    return 0;
}

function get_xml_language($page_num) {
    global $info, $page_contents;
    preg_match('#<html(.*)xml:lang=("|\')(.*)("|\')(.*)>#imsU', $page_contents[$page_num], $matches);
    $info[$page_num]['language']['xml'] = trim($matches[3]);
    return 0;
}

function get_page_language($page_num) {
    global $info, $page_contents;
    preg_match('#<html(.*)lang=("|\')(.*)("|\')(.*)>#imsU', $page_contents[$page_num], $matches);
    $info[$page_num]['language']['page'] = trim($matches[3]);
    return 0;
}

function get_base($page_num) {
    global $page_contents, $page_urls; // both arrays are used below
    // If a base URL is specified via HTML, use it as a base
    preg_match('#<head(.*)>(.*)<base(.*)href=("|\')([^\#].*)("|\')(.*)>(.*)</head>#imsU', $page_contents[$page_num], $matches);
    if (filter_var($matches[5], FILTER_VALIDATE_URL)) {
        $base = $matches[5];
    } else {
        // Else, use the domain name as base
        $parsed_url = parse_url($page_urls[$page_num]);
        $base = $parsed_url['scheme'] . '://' . $parsed_url['host']; // parse_url() uses 'host', not 'hostname'
    }
    return $base;
}

function format_abs_url($url, $page_url) {
    $formatted_url = trim($url);
    if (substr($formatted_url, 0, 2) == '//') {
        $formatted_url = 'http:' . $formatted_url;
    }
    if (!filter_var($formatted_url, FILTER_VALIDATE_URL)) {
        // Is $url an absolute path?
        $is_abs_path = false;
        if (substr($formatted_url, 0, 1) == '/') {
            $is_abs_path = true;
        }
        // Parse the URL of the current page $page_url
        $parsed = parse_url($page_url);
        // If $url is an absolute path, remove the slashes from the beginning of the
        // path and add the domain of the current page in front of the path
        if ($is_abs_path === true) {
            $formatted_url = $parsed['scheme'] . '://' . $parsed['host'] . '/' . ltrim($formatted_url, '/');
        } else {
            $formatted_url = $parsed['scheme'] . '://' . $parsed['host'] . dirname($parsed['path']) . '/' . ltrim($formatted_url, '/');
        }
    }
    return $formatted_url;
}

function get_links($page_num) {
    global $info, $page_contents, $page_urls;
    // Basic URL filtering
    preg_match_all('#<a(.*)href=("|\')([^\#\?].*)("|\')(.*)>(.*)</a>#imsU', $page_contents[$page_num], $matches);
    foreach ($matches[3] as $key => $match) {
        // Get the URL for each link
        $info[$page_num]['links']['url'][] = format_abs_url($match, $page_urls[$page_num]);
    }
    $temp_link_array = array_unique($info[$page_num]['links']['url']);
    $info[$page_num]['links']['url'] = explode('[#SexyCoding#]', implode('[#SexyCoding#]', $temp_link_array));
    unset($temp_link_array);
    return 0;
}

function get_images($page_num) {
    global $info, $page_contents, $page_urls;
    // Basic img URL filtering
    preg_match_all('#<img(.*)src=("|\')([^\#\?].*)("|\')(.*)>#imsU', $page_contents[$page_num], $matches);
    foreach ($matches[3] as $key => $match) {
        // Get the URL for each image
        $info[$page_num]['images']['url'][] = format_abs_url($match, $page_urls[$page_num]);
    }
    $temp_link_array = array_unique($info[$page_num]['images']['url']);
    $info[$page_num]['images']['url'] = explode('[#SexyCoding#]', implode('[#SexyCoding#]', $temp_link_array));
    unset($temp_link_array);
    return 0;
}

function get_meta($page_num) {
    global $info, $page_urls;
    $info[$page_num]['meta'] = get_meta_tags($page_urls[$page_num]);
    return 0;
}

function cmp($a, $b) {
    if ($a == $b) {
        return 0;
    }
    return ($a < $b) ? -1 : 1;
}

function get_imp_kw($page_num) {
    global $info, $page_contents;
    foreach ($info as $pn => $page) {
        $words = str_word_count($page_contents[$pn], 1);
        $info[$pn]['kw'] = array('words' => array(), 'count' => array());
        foreach ($words as $word) {
            // Here, you could check the word against a list of words not to use
            // as keywords: 'the', 'a', HTML tags, etc.
            if (!in_array($word, $info[$pn]['kw']['words'])) {
                $info[$pn]['kw']['words'][] = $word;
                $info[$pn]['kw']['count'][] = 1;
            } else {
                $array_key = array_search($word, $info[$pn]['kw']['words']);
                $info[$pn]['kw']['count'][$array_key]++;
            }
            uasort($info[$pn]['kw']['count'], 'cmp');
        }
    }
}

function get_all_info($page_num) {
    get_page_title($page_num);
    get_meta($page_num);
    get_xml_language($page_num);
    get_page_language($page_num);
    get_links($page_num);
    get_images($page_num);
    get_imp_kw($page_num);
}

/* -------------------------------- FUNCTIONS END -------------------------------- */

/* -------------------------------- GET INFO START -------------------------------- */

// Check whether links from page 0 should be crawled as well
if ($crawl_all == 0) {
    get_all_info(0);
    print_r($info);
    exit;
} elseif ($crawl_all == 1) {
    get_links(0);
}

// For each link found on the first page, add it to the list of links to crawl
foreach ($info[0]['links']['url'] as $new_url) {
    $page_urls[] = $new_url;
}

// Reset $page_contents so the first page isn't in it; we then crawl the new URL list
unset($page_contents);
$page_contents = array();

// For each URL in $page_urls, get its content into the $page_contents array
foreach ($page_urls as $page_url_key => $page_url) {
    $page_contents[] = get_page_content($page_url);
}

// Finally, get all information for all crawled pages
foreach ($page_contents as $con_key => $content) {
    get_all_info($con_key);
}

// Echo crawled URLs and the information for each of these URLs underneath
print_r($page_urls);
print_r($info);

/* -------------------------------- GET INFO END -------------------------------- */
Re: October 2011
Entry #8
<html>
<head>
    <title>PHP Web Crawler Recon</title>
    <style type="text/css">
        .results_box { background-color:#FFFEE0; overflow:auto; }
        .headers { font-size:22px; }
        .results_format { border-style:ridge; border-width:5px; }
        body { background-color:#F9F4E4; }
    </style>
</head>
<body>
<form action="crawl.php" name="crawler" method="POST">
    URL to crawl: <input type="text" name="url" value="http://www.google.com">
    <input type="submit" name="crawlSubmit" value="Crawl!">
</form>
<?php
// Simple PHP reconnaissance tool, written for the betterphp.co.uk competition.
// Hope I do well, Jacek .{-_-}.
// PLEASE make sure you run this on your 'local' web server for all functionality to work.
class Recon
{
    public $urlGiven;       // User defined
    public $ipAddressGiven; // Resolved by gethostbyname

    /*********************************
     * URL validation & errors:      *
     * validates the URL and gives   *
     * two kinds of errors, warn     *
     * and die, a.k.a. fatal.        *
     *********************************/
    public function validateForm($url, $formSubmitted)
    {
        $regex = "@^http(s)?://[a-z0-9-]+(.[a-z0-9-]+)*(:[0-9]+)?(/.*)+$@i"; // Restricted URL to match .com"/test" "not allowed"
        $is_valid_url = preg_match($regex, $url);
        if ($is_valid_url !== 1) { // If a bad URL was entered, die
            $this->errorDie("Please enter a valid URL: Format accepted 'http://www.*.com'");
        }
    }

    public function errorWarn($error_string) // Used to warn, but not stop operation
    {
        echo $error_string . "<br>";
    }

    public function errorDie($die_string) // Fatal warning such as a bad URL; stop execution
    {
        die($die_string);
    }

    /*********************************
     * grabLinks()                   *
     * Grabs links that are present  *
     * on the URL's homepage.        *
     *********************************/
    public function grabLinks()
    {
        $urlGiven = $this->urlGiven;
        if (!empty($urlGiven)) {
            $links = @file_get_contents($urlGiven); // Suppressed, but error handled
            $regex = "@href[ ]*=[ ]*('|\")([^\"'])*('|\")@"; // Regex searches for href attributes
            preg_match_all($regex, $links, $linksRetrievedArr); // Matches all hrefs
            $links_count = count($linksRetrievedArr[0]); // Count the amount matched

            // Cycle and clean href tags, then output all found
            echo "<div style='height:250px; width:500px;' class='results_box'>";
            for ($i = 0; $i < $links_count; $i++) { // Repeat for every link in the array
                $regex = "@href[ ]*=[ ]*('|\")@"; // Split on href to get a clean URL
                $cleaned_href = preg_split($regex, $linksRetrievedArr[0][$i]);
                $regex = "@['|\"]@"; // Search for the trailing quote left behind by the first regex
                $cleaned_href = preg_split($regex, $cleaned_href[1]); // We now have a fully clean path or URL
                echo htmlentities($cleaned_href[0]) . "<br>";
            }
            echo "</div>";
        }
    }

    /*********************************
     * grabIP()                      *
     * Gets the host IP address      *
     * via gethostbyname().          *
     *********************************/
    public function grabIP()
    {
        // Get the remote server IP address
        $cleanURL = $this->urlGiven;
        $regex = "@^(?:http://)?([^/]+)@i"; // Matches the http prefix
        preg_match_all($regex, $cleanURL, $cleanedURL);
        $cleaned = $cleanedURL[1][0];
        $serverip = gethostbyname($cleaned); // URL cleaned to www for gethostbyname (no http://)
        $this->ipAddressGiven = $serverip; // Store to property
        echo "<div style='height:20px; width:500px;' class='results_box'>";
        echo "Server IP Address: " . htmlentities($serverip) . "<br>"; // Output the resolved IP
        echo "</div>";
    }

    /*********************************
     * grabServers()                 *
     * Grabs MX and NS servers       *
     * associated with the target    *
     * URL address.                  *
     *********************************/
    public function grabServers()
    {
        $clean_URL_server = $this->urlGiven;
        $regex = "@^(http://)www\.@"; // Strip http://www.
        $cleaned = preg_split($regex, $clean_URL_server);
        $cleaned = $cleaned[1]; // This is the cleaned URL after the split
        $servers_found = @dns_get_record($cleaned); // Grab all DNS records
        $server_count = count($servers_found);
        echo "<div class='results_box' style='height:100px; width:500px;'>";
        for ($i = 0; $i < $server_count; $i++) { // Cycle and find different types
            if ($servers_found[$i]['type'] == "MX") { // Check for MX servers found
                $get_server_name = htmlentities($servers_found[$i]['target']);
                echo "MX Server: <i>" . $get_server_name . "</i><br>";
            }
        }
        for ($i = 0; $i < $server_count; $i++) {
            if ($servers_found[$i]['type'] == "NS") { // Check for NS servers found
                $get_server_name = htmlentities($servers_found[$i]['target']);
                echo "NS: <i>" . $get_server_name . "</i><br>";
            }
        }
        echo "</div>";
    }

    /*********************************
     * grabHeaders()                 *
     * Grabs current HTTP header     *
     * information; will try and     *
     * force a 403 or similar.       *
     *********************************/
    public function grabHeaders()
    {
        $url = $this->urlGiven;
        $ch = curl_init(); // Start cURL
        curl_setopt($ch, CURLOPT_URL, $url); // Target
        curl_setopt($ch, CURLOPT_HEADER, true);
        curl_setopt($ch, CURLOPT_NOBODY, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); // Grab returned data
        curl_setopt($ch, CURLOPT_TIMEOUT, 10); // Time out after 10 seconds
        $headers = curl_exec($ch); // Execute and store the result
        echo "<div class='results_box' style='height:200px; width:500px;'>";
        echo htmlentities(print_r($headers, true)); // Print the returned headers safely
        echo "</div>";
    }

    /*********************************
     * grabSSH()                     *
     * Grabs the SSH server version. *
     * This is grabbed from the      *
     * banner, so do not trust 100%. *
     *********************************/
    // THIS WILL NOT WORK ON A WEB SERVER UNLESS YOU HAVE DEDICATED ACCESS,
    // OR HAVE THE POWER TO OPEN LOCAL PORTS.
    // THIS WILL WORK ON YOUR LOCAL SYSTEM; ENSURE THE PORTS ARE NOT BLOCKED.
    public function grabSSH($url, $port)
    {
        $url = str_replace("http://", "", $url); // Clean for fsock
        $sock = @fsockopen($url, $port, $errno, $errstr, 15); // Open a connection to TARGET on PORT with a timeout of 15 secs
        if (!$sock) {
            $this->errorWarn("Connection to target failed, host could be blocking our attempts"); // Catch a failed connection
        } else {
            echo "<div class='results_box' style='height:100px; width:500px;'>";
            echo htmlentities(fgetss($sock, 150)); // Safely output the returned data
            echo "</div>";
            fclose($sock); // Close the socket
        }
    }

    /*********************************
     * grabFTP()                     *
     * Grabs the FTP server version. *
     * This is grabbed from the      *
     * banner, so do not trust 100%. *
     *********************************/
    // THIS WILL NOT WORK ON A WEB SERVER UNLESS YOU HAVE DEDICATED ACCESS,
    // OR HAVE THE POWER TO OPEN LOCAL PORTS.
    public function grabFTP($url, $port)
    {
        $url = str_replace("http://", "", $url); // Clean for fsock
        $sock = @fsockopen($url, $port, $errno, $errstr, 15); // Open a connection to TARGET on PORT with a timeout of 15 secs
        if (!$sock) {
            $this->errorWarn("Connection to target failed, host could be blocking our attempts, or authentication needed!"); // Catch a failed connection
        } else {
            echo "<div class='results_box' style='height:100px; width:500px;'>";
            echo htmlentities(fgetss($sock, 150)); // Safely output the returned data
            echo "</div>";
            fclose($sock); // Close the socket
        }
    }

    /*********************************
     * whois()                       *
     * Grabs various domain info.    *
     * ONLY GoDaddy compatible;      *
     * this is to change.            *
     *********************************/
    public function whois($query, $server)
    {
        $query = preg_replace("@^http://www.@", "", $query);
        define('AE_WHOIS_TIMEOUT', 15); // Connection timeout
        global $ae_whois_errno, $ae_whois_errstr;

        // Connecting
        $f = fsockopen($server, 43, $ae_whois_errno, $ae_whois_errstr, AE_WHOIS_TIMEOUT);
        if (!$f) {
            return false; // Connection failed
        }

        // Sending the query
        fwrite($f, $query . "\r\n");

        // Receiving the response
        $response = '';
        while (!feof($f)) {
            $response .= fgets($f, 1024);
        }

        // Closing the connection
        fclose($f);

        echo "<div class='results_box' style='height:200px; word-wrap:break-word; width:500px;'>";
        echo $response;
        echo "</div>";
        return $response;
    }
    // End of Recon class
}

/*******************
 *     OUTPUT      *
 *******************/

// Check the form was submitted and the URL is not blank; if so, continue.
$formSubmitted = (isset($_POST['crawlSubmit'])) ? 1 : 0;
$urlGiven = (isset($_POST['url'])) ? htmlentities($_POST['url']) : NULL;

if ($formSubmitted == 1 && $urlGiven != NULL) {
    $crawler = new Recon();

    // Validate the URL & form
    $crawler->validateForm($urlGiven, $formSubmitted); // Checks for a valid URL
    $crawler->urlGiven = $urlGiven; // If valid, store to the class property

    echo "<div class='results_format'>";

    // Grab links (crawl)
    echo "<div class='headers'> URL's Grabbed From Site </div> <br>";
    $crawler->grabLinks();
    echo "<br>";

    // Grab IP address
    echo "<div class='headers'> IP Address </div> <br>";
    $crawler->grabIP(); // DNS lookup to get the IP
    echo "<br>";

    // HTTP headers
    echo "<div class='headers'> Header Info </div> <br>";
    $crawler->grabHeaders(); // Grabs header info (403 to be forced soon)
    echo "<br>";

    // MX & NS found
    echo "<div class='headers'> Servers found! </div> <br>";
    $crawler->grabServers(); // Find MX and NS server hostnames (IP TO BE INCLUDED SOON)
    echo "<br>";

    // Grab SSH version
    echo "<div class='headers'> SSH Banner Grabbed (Don't rely on this, Nmap is more accurate)</div>";
    $crawler->grabSSH($urlGiven, '22');
    echo "<br>";

    // Grab FTP version
    echo "<div class='headers'> FTP Banner Grabbed (Don't rely on this, Nmap is more accurate)</div>";
    $crawler->grabFTP($urlGiven, '21');
    echo "<br>";

    // Whois results
    echo "<div class='headers'> Whois Info </div>";
    $whois_response = $crawler->whois($urlGiven, 'whois.godaddy.com'); // Searches for the URL given in the GoDaddy whois records
    if (strlen($whois_response) < 100) {
        // Catch the horrible whois server error and make it neat.
        $crawler->errorWarn('There was an error contacting the whois database. Only GoDaddy domains available.');
    }

    echo "</div>";
}
// Have fun, Jacek
?>
</body>
</html>
Re: October 2011
Entry #6
<?php
function get_html($website) {
    return file_get_contents($website);
}

function get_tags($website) {
    $meta_tags = get_meta_tags($website);
    return $meta_tags['keywords'];
}

function get_desc($website) {
    $meta_tags = get_meta_tags($website);
    return $meta_tags['description'];
}

function make_array($url) {
    $array['meta']['keywords'] = get_tags($url);
    $array['meta']['description'] = get_desc($url);

    $reg = "<a\s[^>]*href\s*=\s*([\"\']??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
    preg_match_all("/$reg/siU", get_html($url), $links, PREG_SET_ORDER);

    $i = 1;
    foreach ($links as $link) {
        if (strstr($link[2], "http://") == false) {
            $link[2] = $url . $link[2]; // relative link - prefix the site URL
        }
        $array['links'][$i]['name'] = $link[3];
        $array['links'][$i]['url'] = $link[2];
        $i++;
    }
    return $array;
}

echo "<pre>", print_r(make_array("http://betterphp.co.uk/"), true), "</pre>";
?>
Re: October 2011
Entry #4
<?php
// Make the function
function crawler($url) {
    // The array(s)
    $array = array();
    $links = array();
    $urlname = array();
    $urls = array();

    $meta = get_meta_tags($url);
    if (!$meta) {
        $meta = "Sorry!, no meta-tags are found!";
    }

    $parsedurl = parse_url($url);

    $h = file_get_contents($url);
    $dom = new DOMDocument();
    @$dom->loadHTML($h);
    $p = new DOMXPath($dom);
    $hrefs = $p->evaluate("/html/body//a");

    for ($i = 0; $i < $hrefs->length; $i++) {
        $href = $hrefs->item($i);
        $u = $href->getAttribute('href');
        $n = $href->getAttribute('title');
        $urlname['name'] = $n;
        $urls['url'] = $u;
        $links[] = array_merge((array) $urlname, (array) $urls);
    }

    $array['meta'] = $meta;
    $array['info'] = $parsedurl;
    $array['links'] = $links;

    echo "<pre>";
    print_r($array);
    echo "</pre>";
}

// Test the function
crawler("http://betterphp.co.uk");
?>
Re: October 2011
Entry #2
<?php
error_reporting(E_ALL ^ E_WARNING);

if (isset($_GET['url'])) {
    $dom = new DOMDocument();
    if ($dom->loadHTMLFile($_GET['url'])) {
        $html = array();
        $html['title'] = $dom->getElementsByTagName('title')->item(0)->nodeValue;

        $metas = $dom->getElementsByTagName('meta');
        foreach ($metas as $meta) {
            if (in_array($meta->getAttribute('name'), array('keywords', 'description'))) {
                $html['meta'][$meta->getAttribute('name')] = $meta->getAttribute('content');
            }
        }

        $anchors = $dom->getElementsByTagName('a');
        foreach ($anchors as $anchor) {
            $html['links'][] = array(
                'text'      => $anchor->nodeValue,
                'href'      => $anchor->getAttribute('href'),
                'title/alt' => ($anchor->getAttribute('title'))
                    ? $anchor->getAttribute('title')
                    : $anchor->getAttribute('alt')
            );
        }

        echo '<pre>', htmlspecialchars(print_r($html, true)), '</pre>';
    } else {
        echo 'could not load file';
    }
} else {
    echo 'usage: ?url=htmlfiletocrawl';
}
?>
Re: October 2011
Entry #1
<?php
$un_parsed_url = "http://betterphp.co.uk/"; // your URL

$htmlfile = new DOMDocument();           // built into PHP
$htmlfile->loadHTMLFile($un_parsed_url); // load the HTML code here (in this case from a URL)
$domxpath = new DOMXPath($htmlfile);     // built into PHP

/*
 * //li scans for <li> tags
 * //a scans for <a> tags
 * [@href] after the a is the attribute of the <a> tag (change href to title or something).
 */
$tag = $domxpath->query('//li//a[@href]'); // search for all <a> tags that have an href attribute

$finaloutput = array(); // make an empty array
foreach ($tag as $link) {
    $finaloutput[] = array(
        'name' => $link->nodeValue,
        'url'  => $un_parsed_url . $link->getAttribute('href')
    ); // add the URL and title to the array
}

$arr = array(
    // get meta tags
    'meta'  => get_meta_tags($un_parsed_url),
    // get links
    'links' => $finaloutput,
);

// echo
echo '<pre>';
print_r($arr);
echo '</pre>';
?>