October 2011

Competition announcements will be made in here.
Locked
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

October 2011

Post by jacek »

This competition has now ended

All of the entries that people are happy for me to publish are shown in the posts below with a short video about each one.

The winning entry was submitted by Dylan and is in the first post below this one.

Task:
- To create a simple web crawler to gather information from a website.
- The information does not have to be stored.
- Only look for things that would be useful for a search engine, no marks will be given for pointless info.

Rules:
- All code must be written by you.
- You can use any language.
- No cheating.

Prize:
- Any game or Envato marketplace item.
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #14



[syntax=php]<?php
// Entry #14 (reviewed): crawl a list of start URLs, collecting each page's
// META tags and outgoing links, plus the META tags and links of every linked
// page (one level deep). Results are dumped as a nested array.
error_reporting(E_ALL); // BUG FIX: error_reporting('E_WARNING') passed a string, not the constant
set_time_limit(6000);

// Start URLs.
$urls = array(
    "http://www.betterphp.co.uk/",
    "http://www.phpacademy.org"
);

$results = array();

// BUG FIX: the original `foreach($url as $url)` overwrote the array with its
// own elements on the first iteration.
foreach ($urls as $url) {
    $tags = get_meta_tags($url); // META tags for the start page
    unset($tags['http-equiv'], $tags['google-site-verification']); // not useful for a search engine

    $content = file_get_contents($url);
    preg_match_all('#<a.*href="(.*)".*>(.*)</a>#U', $content, $matches);

    $links = array(); // always defined, even if the page has no links
    $count = count($matches[1]);

    for ($x = 0; $x < $count; $x++) {
        $href = $matches[1][$x];

        if (preg_match('#^https?://#', $href)) {
            // Already absolute — use as-is. (The original skipped https links
            // entirely and reused a stale $tmpUrl from the previous iteration.)
            $tmpUrl = $href;
        } else {
            // Relative link — resolve against the start URL without doubling slashes.
            $tmpUrl = rtrim($url, '/') . '/' . ltrim($href, '/');
        }

        $links[$tmpUrl]['title'] = htmlentities($matches[2][$x]);

        // One level deeper: links and META tags of the linked page.
        $subcontent = file_get_contents($tmpUrl);
        preg_match_all('#<a.+href="(.+)".+>(.+)</a>#U', $subcontent, $submatches);
        unset($submatches[0], $submatches[2]); // keep only the captured URLs

        $subTags = get_meta_tags($tmpUrl);
        unset($subTags['http-equiv'], $subTags['google-site-verification']);

        $links[$tmpUrl]['tags'] = $subTags;
        $links[$tmpUrl]['subUrls'] = $submatches;
    }

    $results[$url] = array(
        "tags" => $tags,
        "links" => $links
    );
}

echo "<pre>";
print_r($results);
echo "</pre>";
?>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #13



[syntax=php]<?php
// Entry #13 (reviewed): fetch one page, report its META data (including a
// scraped <title> fallback), then list the URL and text of every anchor.

/* For displaying certain characters the right way. */
header('Content-Type: text/html; charset=utf-8');

$url = "http://betterphp.co.uk/home.html";

// Fetch the page once and reuse it (the original downloaded it twice).
$meta = get_meta_tags($url);
$page = file_get_contents($url);

// get_meta_tags() does not return the <title>, so scrape it when missing.
if (empty($meta['title'])) {
    if (preg_match("/<title>(.+)<\/title>/i", $page, $title)) {
        $meta['title'] = $title[1];
    }
}

// One echo instead of the three identical branches of the original.
echo '<pre>', print_r($meta, true), '</pre>';

preg_match_all('/<a(?:.*?)href=(["\'].*?["\'])(.*?)>(.*?)\<\/a\>/i', $page, $results);

// Remove the quotes captured around each URL — single or double
// (the original only stripped double quotes).
$results[1] = str_replace(array('"', "'"), '', $results[1]);

// Make relative links absolute. BUG FIX: resolve against the page's
// directory, not against the page URL itself ("home.html/link" was wrong).
$base = rtrim(dirname($url), '/');
foreach ($results[1] as &$result) {
    if (stristr($result, "http://") === FALSE) {
        $result = $base . "/" . ltrim($result, '/');
    }
}
unset($result); // break the reference left by foreach-by-reference

// Escape any <img> markup used as link text so it displays as text
// instead of rendering (or showing a broken-image box).
foreach ($results[3] as &$result) {
    if (stristr($result, "<img")) {
        $result = htmlentities($result);
    }
}
unset($result);

echo "<pre>" . print_r($results[1], true) . "</pre>";
echo "<pre>" . print_r($results[3], true) . "</pre>";
?>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #12
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #11



[syntax=php]<?php
/*
Entry #11 (reviewed): fetch one page, parse it as XML, and print its title,
its META name/content pairs, and the URL of the crawler script itself.

NOTE: simplexml only works here because the target page is well-formed XHTML;
simplexml_load_string() returns FALSE for ordinary (tag-soup) HTML.
*/

// BUG FIX: instead of error_reporting(0) hiding *all* errors, suppress only
// the libxml parse warnings and handle the failure case explicitly.
$html = file_get_contents("http://betterphp.co.uk/home.html");

libxml_use_internal_errors(true);
$data = simplexml_load_string($html);
libxml_clear_errors();

if ($data === false) {
    exit('The page could not be parsed as XML.');
}

// The first <meta> is the content-type declaration — not useful here.
unset($data->head->meta[0]);

echo "<strong>Title</strong>: ", $data->head->title, "<br />";

foreach ($data->head->meta as $meta) {
    echo "<strong>", ucwords($meta['name']), "</strong>: ", $meta['content'], "<br />";
}

echo "<strong>URL:</strong> {$_SERVER['SERVER_NAME']}{$_SERVER['PHP_SELF']}";
?>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #10



[syntax=php]<?php



/* Web crawler - BetterPHP competition */



/**

* Copyright 2011 Conrad Kleinespel - http://conradk.com

*

* This program is free software: you can redistribute it and/or modify

* it under the terms of the GNU General Public License as published by

* the Free Software Foundation, either version 3 of the License, or

* (at your option) any later version.

*

* This program is distributed in the hope that it will be useful,

* but WITHOUT ANY WARRANTY; without even the implied warranty of

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

* GNU General Public License for more details.

*

* You should have received a copy of the GNU General Public License

* along with this program. If not, see <http://www.gnu.org/licenses/>.

*

**/



/**

* INFO THE SCRIPT GATHERS

* -----------------------

* 1. X Title of the page

* 2. X Meta tags content

* 3. X Language of the page if provided

* 4. X Links

* 5. X Keywords and number of times they are repeated in the page (-> weight ?)

* 6. X Links to image files

*

*/



// Show the results as plain text.
header('Content-type: text/plain; charset=utf-8;');

/**
 * -------------------------------- PARAMS START --------------------------------
 **/

// URL of the page to crawl (format is validated further below).
// BUG FIX: read $_GET defensively so a missing parameter raises no notice.
$url = isset($_GET['url']) ? $_GET['url'] : '';

// Should pages that $url links to be crawled as well? Defaults to 0 (no).
$crawl_all = (isset($_GET['crawl_all']) && !empty($_GET['crawl_all']))
    ? (bool) $_GET['crawl_all']
    : 0;

/**
 * -------------------------------- PARAMS STOP --------------------------------
 **/



/**

* -------------------------------- ARRAYS START --------------------------------

**/



// The $page_urls array contains all URLs to crawl.
$page_urls = array();

// The $page_contents array contains the content of the pages that have been crawled.
$page_contents = array();

// All information gathered about the pages is stored in $info.
$info = array();

/**
 * -------------------------------- ARRAYS STOP --------------------------------
 **/

/**
 * -------------------------------- ERROR HANDLING START --------------------------------
 **/

// Collect every input problem so they can all be reported at once.
$errors = array();

if (!isset($url) || empty($url)) {
    $errors[] = 'No URL to crawl was specified.';
} elseif (!filter_var($url, FILTER_VALIDATE_URL)) {
    // Only validate the format when a URL was actually supplied, so one
    // missing parameter no longer produces two error messages.
    $errors[] = 'The URL you have specified is not valid.';
}

if (!empty($errors)) {
    echo count($errors);
    echo (count($errors) > 1) ? " errors have occurred.\n" : " error has occurred.\n";

    foreach ($errors as $err_key => $error) {
        echo "{$err_key} - {$error}\n";
    }

    exit();
}



/**

* -------------------------------- ERROR HANDLING END --------------------------------

**/



/**

* -------------------------------- FUNCTIONS START --------------------------------

**/



// Fetch the body of a page over HTTP using cURL.
function get_page_content($url) {
    $handle = curl_init();

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, TRUE);  // return the body instead of printing it
    curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, 5);     // give up connecting after 5 seconds
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, TRUE);  // follow HTTP redirects

    $body = curl_exec($handle);
    curl_close($handle);

    return $body;
}

// Seed the crawl: page 0 is the URL that was passed to the script.
$page_urls[0] = $url;
$page_contents[0] = get_page_content($page_urls[0]);



// Functions used to get different kinds of information

// Extract the page's <title> into $info[$page_num]['title'].
function get_page_title($page_num) {
    global $info, $page_contents;

    // BUG FIX: guard the match so $matches[1] is never read when the page
    // has no <title> element.
    if (preg_match('#<title>(.*)</title>#imsU', $page_contents[$page_num], $matches)) {
        $info[$page_num]['title'] = trim($matches[1]);
    } else {
        $info[$page_num]['title'] = '';
    }

    return 0;
}

// Extract the xml:lang attribute of <html> into $info[$page_num]['language']['xml'].
function get_xml_language($page_num) {
    global $info, $page_contents;

    // Guard the match: not every page declares xml:lang.
    if (preg_match('#<html(.*)xml:lang=("|\')(.*)("|\')(.*)>#imsU', $page_contents[$page_num], $matches)) {
        $info[$page_num]['language']['xml'] = trim($matches[3]);
    } else {
        $info[$page_num]['language']['xml'] = '';
    }

    return 0;
}

// Extract the lang attribute of <html> into $info[$page_num]['language']['page'].
function get_page_language($page_num) {
    global $info, $page_contents;

    // Guard the match: not every page declares a lang attribute.
    if (preg_match('#<html(.*)lang=("|\')(.*)("|\')(.*)>#imsU', $page_contents[$page_num], $matches)) {
        $info[$page_num]['language']['page'] = trim($matches[3]);
    } else {
        $info[$page_num]['language']['page'] = '';
    }

    return 0;
}

// Work out the base URL used to resolve relative links on a page.
function get_base($page_num) {
    // BUG FIX: $page_contents and $page_urls were used without a `global`
    // declaration, so this function always operated on undefined variables.
    global $page_contents, $page_urls;

    // If the page declares a valid <base href>, use it.
    if (preg_match('#<head(.*)>(.*)<base(.*)href=("|\')([^\#].*)("|\')(.*)>(.*)</head>#imsU', $page_contents[$page_num], $matches)
        && filter_var($matches[5], FILTER_VALIDATE_URL)) {
        $base = $matches[5];
    } else {
        // Otherwise fall back to the scheme + host of the page's own URL.
        // BUG FIX: parse_url() returns the key 'host', not 'hostname'.
        $parsed_url = parse_url($page_urls[$page_num]);
        $base = $parsed_url['scheme'] . '://' . $parsed_url['host'];
    }

    return $base;
}

// Turn a possibly-relative URL found on $page_url into an absolute URL.
function format_abs_url($url, $page_url) {
    $candidate = trim($url);

    // Protocol-relative URLs ("//host/path") get an explicit scheme.
    if (substr($candidate, 0, 2) == '//') {
        $candidate = 'http:' . $candidate;
    }

    // Anything that already validates as a URL is returned unchanged.
    if (filter_var($candidate, FILTER_VALIDATE_URL)) {
        return $candidate;
    }

    // Otherwise resolve against the page the link was found on.
    $parsed = parse_url($page_url);

    if (substr($candidate, 0, 1) == '/') {
        // Absolute path: resolve against the host root.
        return $parsed['scheme'] . '://' . $parsed['host'] . '/' . ltrim($candidate, '/');
    }

    // Relative path: resolve against the directory of the current page.
    return $parsed['scheme'] . '://' . $parsed['host'] . dirname($parsed['path']) . '/' . ltrim($candidate, '/');
}

// Collect every unique, absolutised link URL on a page into
// $info[$page_num]['links']['url'].
function get_links($page_num) {
    global $info, $page_contents, $page_urls;

    // Basic anchor extraction; fragment-only (#...) and query-only (?...)
    // hrefs are excluded by the character class.
    preg_match_all('#<a(.*)href=("|\')([^\#\?].*)("|\')(.*)>(.*)</a>#imsU', $page_contents[$page_num], $matches);

    // Always initialise, so a page without links does not leave this unset.
    $info[$page_num]['links']['url'] = array();

    foreach ($matches[3] as $match) {
        $info[$page_num]['links']['url'][] = format_abs_url($match, $page_urls[$page_num]);
    }

    // De-duplicate, then reindex sequentially. (The original reindexed via an
    // implode()/explode() round-trip; array_values() does the same job.)
    $info[$page_num]['links']['url'] = array_values(array_unique($info[$page_num]['links']['url']));

    return 0;
}

// Collect every unique, absolutised image URL on a page into
// $info[$page_num]['images']['url'].
function get_images($page_num) {
    global $info, $page_contents, $page_urls;

    // Basic <img src> extraction; fragment/query-only sources are excluded.
    preg_match_all('#<img(.*)src=("|\')([^\#\?].*)("|\')(.*)>#imsU', $page_contents[$page_num], $matches);

    // Always initialise, so a page without images does not leave this unset.
    $info[$page_num]['images']['url'] = array();

    foreach ($matches[3] as $match) {
        $info[$page_num]['images']['url'][] = format_abs_url($match, $page_urls[$page_num]);
    }

    // De-duplicate, then reindex sequentially (replaces the original's
    // implode()/explode() round-trip).
    $info[$page_num]['images']['url'] = array_values(array_unique($info[$page_num]['images']['url']));

    return 0;
}

// Store the page's <meta> tags under $info[$page_num]['meta'].
function get_meta($page_num) {

global $info, $page_urls;

// NOTE(review): get_meta_tags() downloads the page head over HTTP again
// rather than reusing $page_contents — confirm this is intentional.
$info[$page_num]['meta'] = get_meta_tags($page_urls[$page_num]);

return 0;

}

// Comparison callback for uasort(): orders values ascending.
// Returns 0 for equal values, -1 when $a sorts first, 1 when $b does.
function cmp($a, $b) {
    return ($a == $b) ? 0 : (($a < $b) ? -1 : 1);
}

// Build a crude keyword index for each page: every distinct word and the
// number of times it occurs, sorted by ascending count.
function get_imp_kw($page_num) {
    global $info, $page_contents;

    // NOTE: like the original, this rebuilds the keyword data for every page
    // currently in $info, not just $page_num.
    foreach ($info as $pn => $page) {
        $words = str_word_count($page_contents[$pn], 1);

        // BUG FIX: initialise both sub-arrays up front so in_array()/
        // array_search() never operate on an undefined index.
        $info[$pn]['kw'] = array('words' => array(), 'count' => array());

        foreach ($words as $word) {
            // Here you could skip stop-words ('the', 'a', HTML tag names, etc.).
            $array_key = array_search($word, $info[$pn]['kw']['words'], true);

            if ($array_key === false) {
                // First sighting of this word.
                $info[$pn]['kw']['words'][] = $word;
                $info[$pn]['kw']['count'][] = 1;
            } else {
                $info[$pn]['kw']['count'][$array_key]++;
            }
        }

        // PERF FIX: sort once per page — the original called uasort() inside
        // the inner word loop, re-sorting after every single word.
        uasort($info[$pn]['kw']['count'], 'cmp');
    }
}

// Run every extraction step for one crawled page, filling $info[$page_num].
function get_all_info($page_num) {

get_page_title($page_num);

get_meta($page_num);

get_xml_language($page_num);

get_page_language($page_num);

get_links($page_num);

get_images($page_num);

get_imp_kw($page_num);

}



/**

* -------------------------------- FUNCTIONS END --------------------------------

**/



/**

* -------------------------------- GET INFO START --------------------------------

**/



// Check if links from page 0 should be crawled as well or not

if($crawl_all == 0) {

get_all_info(0);

print_r($info);

exit;

} elseif ($crawl_all == 1) {

get_links(0);

}



// Foreach link found on the first page...

foreach($info[0]['links']['url'] as $new_url) {

// Add the link to the list of links to crawl

$page_urls[] = $new_url;

}



// Unset $page_contents so the first page isn't in it, so that we basically crawl the new URL list
// NOTE(review): $page_urls[0] is still the original URL, so page 0 is fetched a second time below.

unset($page_contents);

$page_contents = array();



// Foreach url from $page_urls

foreach($page_urls as $page_url_key => $page_url) {

// Get the content from that url and put it into the $page_contents array

$page_contents[] = get_page_content($page_url);

}



// Finally, get all information for all crawled pages

foreach($page_contents as $con_key => $content) {

get_all_info($con_key);

}



// Echo crawled URLs and the information for each of these URLs underneath

print_r($page_urls);

print_r($info);



/**

* -------------------------------- GET INFO END --------------------------------

**/[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #9
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #8



[syntax=php]<html>

<head>

<title>PHP Web Crawler Recon</title>

</head>

<body>

<style type="text/css">

.results_box {

background-color:#FFFEE0;

overflow: auto;

}



.headers {

font-size:22px;

}



.results_format{

border-style:ridge;

border-width:5px;

}



body {

background-color:#F9F4E4;

}

</style>



<form action="crawl.php" name="crawler" method="POST">

URL to crawl: <input type="text" name="url" value="http://www.google.com">

<input type="submit" name="crawlSubmit" value="Crawl!">

</form>



<?php

//Simple PHP reconnaissance tool, written for betterphp.co.uk competition ;) Hope I do well Jacek .{-_-}.

//PLEASE Make sure you run this on your 'local' web server for all functionality to work



class Recon {

    public $urlGiven;        // Target URL, stored by the caller after validation
    public $ipAddressGiven;  // Resolved in grabIP() via gethostbyname()

    /*********************************
     * URL validation & errors       *
     *********************************/

    // Validate the submitted URL; a fatal error stops the script on a bad one.
    // Accepted format: http(s)://host[:port]/path — a path is required, so a
    // bare "http://example.com" is rejected.
    public function validateForm($url, $formSubmitted)
    {
        $regex = "@^http(s)?://[a-z0-9-]+(.[a-z0-9-]+)*(:[0-9]+)?(/.*)+$@i";

        // BUG FIX: the original tested `=== 1` and therefore died exactly when
        // the URL *was* valid. Die only when the pattern does NOT match.
        if (preg_match($regex, $url) !== 1)
        {
            $this->errorDie("Please enter a valid URL: Format accepted 'http://www.*.com'");
        }
    }

    public function errorWarn($error_string) //Used to warn, but not stop operation
    {
        echo $error_string . "<br>";
    }

    public function errorDie($die_string) //Fatal error such as bad URL: stop execution
    {
        die($die_string);
    }

    /*********************************
     * grabLinks()                   *
     * Lists the href targets found  *
     * on the target's homepage.     *
     *********************************/
    public function grabLinks()
    {
        if (empty($this->urlGiven))
        {
            return;
        }

        $page = @file_get_contents($this->urlGiven); //Suppressed; a failed fetch simply yields no links

        preg_match_all("@href[ ]*=[ ]*('|\")([^\"'])*('|\")@", $page, $linksRetrievedArr); //Matches all hrefs

        echo "<div style='height:250; width:500;' class='results_box'>";

        foreach ($linksRetrievedArr[0] as $rawHref)
        {
            // Strip the leading `href="` and the trailing quote to leave a clean URL or path.
            $cleaned_href = preg_split("@href[ ]*=[ ]*('|\")@", $rawHref);
            $cleaned_href = preg_split("@['\"]@", $cleaned_href[1]);

            echo htmlentities($cleaned_href[0]) . "<br>";
        }

        echo "</div>";
    }

    /*********************************
     * grabIP()                      *
     * Resolves the host's IP        *
     * address via gethostbyname().  *
     *********************************/
    public function grabIP()
    {
        // Strip the scheme/path so gethostbyname() gets a bare host name.
        preg_match("@^(?:http://)?([^/]+)@i", $this->urlGiven, $cleanedURL);
        $host = isset($cleanedURL[1]) ? $cleanedURL[1] : $this->urlGiven;

        $serverip = gethostbyname($host);
        $this->ipAddressGiven = $serverip; //Stored for later use

        echo "<div style='height:20; width:500;' class='results_box'>";
        echo "Server IP Address: " . htmlentities($serverip) . "<br>";
        echo "</div>";
    }

    /*********************************
     * grabServers()                 *
     * Lists MX and NS records       *
     * for the target domain.        *
     *********************************/
    public function grabServers()
    {
        // BUG FIX: the original preg_split() left the domain in the wrong slot
        // (or nowhere) when the URL had no "www." and kept the path, which
        // made dns_get_record() fail. Use parse_url() for the host instead.
        $domain = parse_url($this->urlGiven, PHP_URL_HOST);
        $domain = preg_replace("@^www\.@", "", $domain);

        $servers_found = @dns_get_record($domain); //Grab all DNS records
        if ($servers_found === false)
        {
            $servers_found = array();
        }

        echo "<div class='results_box' style='height:100; width:500;'>";

        foreach ($servers_found as $record) //MX records first
        {
            if ($record['type'] == "MX")
            {
                // Escaped: DNS data is third-party input.
                echo "MX Server: <i>" . htmlentities($record['target']) . "</i><br>";
            }
        }

        foreach ($servers_found as $record) //Then NS records
        {
            if ($record['type'] == "NS")
            {
                echo "NS: <i>" . htmlentities($record['target']) . "</i><br>";
            }
        }

        echo "</div>";
    }

    /*********************************
     * grabHeaders()                 *
     * Fetches the target's HTTP     *
     * response headers via cURL.    *
     *********************************/
    public function grabHeaders()
    {
        $ch = curl_init();

        curl_setopt($ch, CURLOPT_URL, $this->urlGiven);
        curl_setopt($ch, CURLOPT_HEADER, true);          //Include headers in the output
        curl_setopt($ch, CURLOPT_NOBODY, true);          //HEAD-style request: no body
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);  //Return instead of printing
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);           //Time out after 10 seconds

        $headers = curl_exec($ch);
        curl_close($ch); // BUG FIX: the handle was never closed

        echo "<div class='results_box' style='height:200; width:500;'>";
        // BUG FIX: print_r() without a second argument prints immediately and
        // returns TRUE, so the original escaped "1" while the raw headers went
        // out unescaped. Escape the actual header string instead.
        echo htmlentities($headers);
        echo "</div>";
    }

    /*********************************
     * grabSSH() / grabFTP()         *
     * Read the service banner —     *
     * banner data can lie, so do    *
     * not trust it 100%.            *
     *********************************/
    //THESE WILL NOT WORK ON A SHARED WEB SERVER: THEY NEED THE ABILITY TO OPEN
    //OUTBOUND SOCKETS. RUN LOCALLY WITH THE PORTS UNBLOCKED.

    public function grabSSH($url, $port)
    {
        $url = str_replace("http://", "", $url); //Clean for fsockopen
        $sock = @fsockopen($url, $port, $errno, $errstr, 15); //15 second timeout

        if (!$sock)
        {
            $this->errorWarn("Connection to target failed, host could be blocking our attempts");
        }
        else
        {
            echo "<div class='results_box' style='height:100; width:500;'>";
            echo htmlentities(fgetss($sock, 150)); //Safely output the banner
            echo "</div>";
            fclose($sock);
        }
    }

    public function grabFTP($url, $port)
    {
        $url = str_replace("http://", "", $url); //Clean for fsockopen
        $sock = @fsockopen($url, $port, $errno, $errstr, 15); //15 second timeout

        if (!$sock)
        {
            $this->errorWarn("Connection to target failed, host could be blocking our attempts, or authentication needed!");
        }
        else
        {
            echo "<div class='results_box' style='height:100; width:500;'>";
            // BUG FIX: this banner was echoed unescaped, unlike grabSSH()'s.
            echo htmlentities(fgetss($sock, 150));
            echo "</div>";
            fclose($sock);
        }
    }

    /*********************************
     * whois()                       *
     * Queries a whois server (only  *
     * GoDaddy-registered domains).  *
     *********************************/
    public function whois($query, $server)
    {
        $query = preg_replace("@^http://www.@", "", $query);

        // Guard the define(): whois() may be called more than once.
        if (!defined('AE_WHOIS_TIMEOUT'))
        {
            define('AE_WHOIS_TIMEOUT', 15); //Connection timeout
        }

        $f = fsockopen($server, 43, $errno, $errstr, AE_WHOIS_TIMEOUT);

        if (!$f)
        {
            return false; //Connection failed
        }

        //Send the query, then read the whole response.
        fwrite($f, $query . "\r\n");

        $response = '';
        while (!feof($f))
        {
            $response .= fgets($f, 1024);
        }

        fclose($f);

        echo "<div class='results_box' style='height:200; word-wrap:break-word; width:500'>";
        // BUG FIX: whois output is third-party text — escape it before echoing.
        echo htmlentities($response);
        echo "</div>";

        return $response;
    }

//End of Recon Class
}



/*******************

* *

* OUTPUT *

* *

*******************/



/*******************
 *     OUTPUT      *
 *******************/

//Check form submitted & URL not blank; if so, continue.
$formSubmitted = (isset($_POST['crawlSubmit'])) ? 1 : 0;
// BUG FIX: the raw URL is needed for the network lookups below —
// HTML-escaping it up front corrupted URLs containing '&' etc. The class
// escapes values at the point where they are echoed instead.
$urlGiven = (isset($_POST['url'])) ? trim($_POST['url']) : NULL;

if ($formSubmitted == 1 && $urlGiven !== NULL && $urlGiven !== '')
{
    $crawler = new Recon();

    /* Validate URL & form. */
    $crawler->validateForm($urlGiven, $formSubmitted); //Checks for valid URL; dies on a bad one
    $crawler->urlGiven = $urlGiven; //If valid then store to class property

    echo "<div class='results_format'>";

    /* Grab links (crawl). */
    echo "<div class='headers'> URL's Grabbed From Site </div> <br>";
    $crawler->grabLinks();
    echo "<br>";

    /* Grab IP address. */
    echo "<div class='headers'> IP Address </div> <br>";
    $crawler->grabIP(); //Reverse-DNS to generate IP
    echo "<br>";

    /* HTTP headers. */
    echo "<div class='headers'> Header Info </div> <br>";
    $crawler->grabHeaders();
    echo "<br>";

    /* MX & NS records. */
    echo "<div class='headers'> Servers found! </div> <br>";
    $crawler->grabServers();
    echo "<br>";

    /* SSH banner. */
    echo "<div class='headers'> SSH Banner Grabbed (Don't rely on this, Nmap is more accurate)</div>";
    $crawler->grabSSH($urlGiven, '22');
    echo "<br>";

    /* FTP banner. */
    echo "<div class='headers'> FTP Banner Grabbed (Don't rely on this, Nmap is more accurate)</div>";
    $crawler->grabFTP($urlGiven, '21');
    echo "<br>";

    /* Whois results. */
    echo "<div class='headers'> Whois Info </div>";
    $whois_response = $crawler->whois($urlGiven, 'whois.godaddy.com'); //GoDaddy whois records only

    if (strlen($whois_response) < 100)
    {
        //Catch the terse whois-server error (or a failed connection) and present it neatly.
        $crawler->errorWarn('There was an error contacting the whois database. Only GoDaddy domains available.');
    }

    echo "</div>";
}

//Have fun Jacek :)

?>



</body>

</html>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #6



[syntax=php]<?php
// Entry #6 (reviewed): dump a page's META keywords/description and all links.

// Fetch the raw HTML of a page.
function get_html($website) {
    return file_get_contents($website);
}

// Return the page's META keywords, or '' when the tag is absent.
function get_tags($website) {
    $meta_tags = get_meta_tags($website);

    // BUG FIX: guard the index — not every page defines a keywords tag.
    return isset($meta_tags['keywords']) ? $meta_tags['keywords'] : '';
}

// Return the page's META description, or '' when the tag is absent.
function get_desc($website) {
    $meta_tags = get_meta_tags($website);

    return isset($meta_tags['description']) ? $meta_tags['description'] : '';
}

// Build the full report: META data plus name/url of every anchor on the page.
function make_array($url) {
    $array['meta']['keywords'] = get_tags($url);
    $array['meta']['description'] = get_desc($url);

    $reg = "<a\s[^>]*href\s*=\s*([\"\']??)([^\" >]*?)\\1[^>]*>(.*)<\/a>";
    preg_match_all("/$reg/siU", get_html($url), $links, PREG_SET_ORDER);

    $i = 1;
    foreach ($links as $link) {
        // Make relative links absolute.
        // BUG FIX: strstr() returns string|false — compare with === to avoid
        // loose-comparison surprises.
        if (strstr($link[2], "http://") === false) {
            $link[2] = $url . $link[2];
        }

        $array['links'][$i]['name'] = $link[3];
        $array['links'][$i]['url'] = $link[2];
        $i++;
    }

    return $array;
}

// BUG FIX: print_r() without its second argument prints immediately and
// returns TRUE, so the array appeared before the <pre> tag and a stray "1"
// was echoed between the tags. Pass true so the dump lands inside the <pre>.
echo "<pre>", print_r(make_array("http://betterphp.co.uk/"), true), "</pre>";
?>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #4



[syntax=php]<?php

// Entry #4 (reviewed): dump a page's META tags, parsed URL parts and the
// title/target of every anchor, using DOM + XPath.
function crawler($url){
    $array = array();
    $links = array();

    $meta = get_meta_tags($url);
    if(!$meta){
        $meta = "Sorry!, no meta-tags are found!";
    }

    $parsedurl = parse_url($url);
    $h = file_get_contents($url);

    // Suppress the warnings libxml raises for real-world (tag-soup) HTML.
    $dom = new DOMDocument();
    @$dom->loadHTML($h);

    $p = new DOMXPath($dom);
    $hrefs = $p->evaluate("/html/body//a");

    for ($i = 0; $i < $hrefs->length; $i++){
        $href = $hrefs->item($i);

        // One entry per anchor: its title attribute and its target.
        // (The original built two single-key arrays and array_merge()d them
        // with redundant (array) casts — this is the same result directly.)
        $links[] = array(
            'name' => $href->getAttribute('title'),
            'url'  => $href->getAttribute('href'),
        );
    }

    $array['meta'] = $meta;
    $array['info'] = $parsedurl;
    $array['links'] = $links;

    echo "<pre>";
    print_r($array);
    echo "</pre>";
}

//test the function
crawler("http://betterphp.co.uk");
?>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #2



[syntax=php]<?php

error_reporting(E_ALL ^ E_WARNING);

// Crawl the document given by ?url= and dump its title, META data and links.
if (!isset($_GET['url'])) {
    echo 'usage: ?url=htmlfiletocrawl';
} else {
    $dom = new DOMDocument();

    if (!$dom->loadHTMLFile($_GET['url'])) {
        echo 'could not load file';
    } else {
        $html = array();

        // Document title.
        $html['title'] = $dom->getElementsByTagName('title')->item(0)->nodeValue;

        // Keywords / description META tags only.
        foreach ($dom->getElementsByTagName('meta') as $meta) {
            $name = $meta->getAttribute('name');

            if (in_array($name, array('keywords', 'description'))) {
                $html['meta'][$name] = $meta->getAttribute('content');
            }
        }

        // Every anchor: text, href, and title (falling back to alt).
        foreach ($dom->getElementsByTagName('a') as $anchor) {
            $title = $anchor->getAttribute('title');

            $html['links'][] = array(
                'text' => $anchor->nodeValue,
                'href' => $anchor->getAttribute('href'),
                'title/alt' => ($title ? $title : $anchor->getAttribute('alt')),
            );
        }

        echo '<pre>' , htmlspecialchars(print_r($html, true)) , '</pre>';
    }
}

?>[/syntax]
Image
User avatar
jacek
Site Admin
Posts: 3262
Joined: Thu May 05, 2011 1:45 pm
Location: UK
Contact:

Re: October 2011

Post by jacek »

Entry #1



[syntax=php]<?php

// Entry #1 (reviewed): list a page's META tags plus the name and URL of
// every link found inside <li> elements.

$un_parsed_url = "http://betterphp.co.uk/"; // your url

$htmlfile = new DOMDocument(); // built-in DOM parser
$htmlfile->loadHTMLFile($un_parsed_url); // load the html code here (in this case a url)

$domxpath = new DOMXPath($htmlfile);
/*
 * //li scans for <li> tags and //a for <a> tags inside them;
 * [@href] restricts the match to anchors that have an href attribute
 * (change href to title or something to match other attributes).
 */
$tag = $domxpath->query('//li//a[@href]');

// BUG FIX: the original initialised $finaloutput but then appended to a
// misspelled $finalouput, so the initialisation did nothing.
$finaloutput = array();

foreach ($tag as $link) {
    $finaloutput[] = array(
        'name' => $link->nodeValue,
        'url' => $un_parsed_url . $link->getAttribute('href')
    ); // add url and title to the result array
}

// BUG FIX: the array keys are now quoted strings — the bare meta/links
// constants relied on PHP's deprecated undefined-constant fallback.
$arr = array(
    // get meta tags.
    'meta' => get_meta_tags($un_parsed_url),
    // get links
    'links' => $finaloutput,
);

// echo
echo '<pre>';
print_r($arr);
echo '</pre>';
?>[/syntax]
Image
Locked