サイト収集スクリプトfunction.php

ザウルスNetFrontでも動作するように修正。function.phpを直接開いた際にフォームを表示するようにした。PATH項目が登録されないバグも回避できないか、試しにSLEEPを入れてみた。効果のほどは確認していない。

<?php
// ---- Configuration ----------------------------------------------------------
// The full "<?php" tag is used so the script does not depend on short_open_tag.

// Download root handed to "wget -P"; the archive/delete helpers resolve paths under it.
define("SAVE_DIR",	"./data");

// NOTE(review): the four DB_* values below look like redacted placeholders.
// They were written as bare constants referring to themselves, which is an
// "undefined constant" notice on old PHP and a fatal Error on PHP 8.  They are
// now explicit strings with the same runtime value; replace them with the real
// connection settings before deploying.
define("DB_HOST",	"DB_HOST");
define("DB_USER",	"DB_USER");
define("DB_PASS",	"DB_PASS");
define("DB_NAME",	"DB_NAME");

// Absolute paths of the external programs run through exec().
define("WGET_CMD",	"/usr/local/bin/wget");
define("RM_CMD",	"/bin/rm");
define("ZIP_CMD",	"/usr/local/bin/zip");
define("LS_CMD",	"/bin/ls");
 
// ---- Request dispatch -------------------------------------------------------
// Query parameters read here: mode, url, title, comment, ajax, update_check.
// Array keys are quoted: the bareword form ($_GET[mode]) is an undefined
// constant lookup, which is a fatal Error on PHP 8.
$mode = (isset($_GET['mode']) && is_string($_GET['mode'])) ? $_GET['mode'] : "";

// A request without "ajax", or with ajax=true, gets only the raw response.
// Anything else (the built-in forms send ajax=false) gets the form page again.
$isAjax = !isset($_GET['ajax']) || $_GET['ajax'] == "true";

if( $mode === "select" ){
	select( isset($_GET['update_check']) ? $_GET['update_check'] : "0" );
	return;
}

if( isset($_GET['url']) && is_string($_GET['url']) ){
	// NOTE(review): PHP has already URL-decoded $_GET, so this is a second
	// decode.  It is kept because the client that builds these requests is not
	// visible here and may encode twice -- confirm before removing.
	$src = urldecode($_GET['url']);
	switch( $mode ){
		case "wget":
			wget( $src );
			$title   = mb_convert_encoding(isset($_GET['title']) ? $_GET['title'] : "", 'UTF-8', 'auto');
			$comment = mb_convert_encoding(isset($_GET['comment']) ? $_GET['comment'] : "", 'UTF-8', 'auto');
			// wget is started with -b (background).  This pause was added as an
			// attempted workaround for PATH not being recorded; its effect is unverified.
			sleep(1);
			dbput( $src, $title, $comment );
			if( !$isAjax ){
				nonAjax();
			}
			break;
		case "rmdir":
			dbdel( $src );
			if( !$isAjax ){
				nonAjax();
			}
			break;
		case "title":
			echo getTitle( $src );
			break;
		case "zip":
			$archive = zip( $src );
			if( $isAjax ){
				echo $archive;
			}else{
				// Redirect to the archive only when one was produced; an empty
				// "Location:" header would otherwise be sent on failure.
				if( $archive != "" ){
					header("Location: ".$archive);
				}
				nonAjax();
			}
			break;
	}
}else{
	// No usable URL: show the plain HTML forms.
	nonAjax();
}
 
/**
 * Prints the plain HTML page used when the script is opened without Ajax:
 * a wget form, plus "zip" and "delete" forms whose <select> lists every URL
 * stored in site_list.
 *
 * On a database error the MySQL error text is echoed and nothing else is output.
 *
 * NOTE(review): ext/mysql (mysql_*) was removed in PHP 7.0.  It is kept here so
 * this function stays consistent with the other DB functions in the file;
 * migrate them all to mysqli/PDO together.
 */
function nonAjax(){
	$con = mysql_connect(DB_HOST,DB_USER,DB_PASS);
	if( !$con ){
		// There is no link to close; just report why the connect failed.
		echo(mysql_error());
		return;
	}
	if( !mysql_select_db(DB_NAME, $con) ){
		// Read the error BEFORE closing -- it is unavailable afterwards.
		echo(mysql_error($con));
		mysql_close($con);
		return;
	}
 
	$sql = " select `URL`,`TITLE` from site_list order by `URL`";
	if( !($rs = mysql_query($sql, $con)) ){
		echo(mysql_error($con));
		mysql_close($con);
		return;
	}
 
	// Build the shared <option> list.  URL and TITLE come from user-supplied
	// data, so both are HTML-escaped (the value attribute is single-quoted,
	// hence ENT_QUOTES).
	$option = "";
	while( $item = mysql_fetch_assoc($rs) ){
		$url	= htmlspecialchars((string)$item['URL'], ENT_QUOTES, 'UTF-8');
		$title	= htmlspecialchars((string)$item['TITLE'], ENT_QUOTES, 'UTF-8');
		$option .= "<option value='".$url."'>".$title."</option>";
	}
	mysql_close($con);
 
print <<<EOF
<html>
<head>
<title>wget</title>
</head>
<body>
 
<form method="GET" action="function.php">
<input type="hidden" name="ajax" value="false">
<input type="hidden" name="mode" value="wget">
<table border="0">
<tr><th>URL</th><td><input id="url" type="text" size="60" name="url" value="http://">
</td></tr>
<tr><th>タイトル</th><td><input id="title" type="text" size="60" name="title" value="">
</td></tr>
<tr><th>コメント</th><td><input id="comment" type="text" size="60" name="comment" value="">
<input type="submit" value="wget実行">
</td></tr>
</table>
</form>
 
<form method="GET" action="function.php">
<input type="hidden" name="ajax" value="false">
<input type="hidden" name="mode" value="zip">
<select id="url" name="url">{$option}</select>
<input type="submit" value="圧縮">
</form>
 
<form method="GET" action="function.php">
<input type="hidden" name="ajax" value="false">
<input type="hidden" name="mode" value="rmdir">
<select id="url" name="url">{$option}</select>
<input type="submit" value="削除">
</form>
 
<a href=".">back</a>
 
</body>
</html>
EOF;
}
 
/**
 * Starts a recursive background wget mirror of $url under SAVE_DIR.
 *
 * Only URLs beginning with "http://" that contain a host are accepted.
 * Every value interpolated into the command line is passed through
 * escapeshellarg(), because $url comes straight from the request and was
 * previously concatenated into the shell command unescaped.
 *
 * @param string $url  Page to mirror.
 * @return array|false Lines wget printed to stdout, or false if $url was rejected.
 */
function wget( $url ){
	// preg_match replaces ereg(), which was removed in PHP 7.
	if( !is_string($url) || !preg_match('#^http://#', $url) ){
		return false;
	}

	$domain = parse_url($url);
	if( !is_array($domain) || empty($domain['host']) ){
		return false;
	}

	// -D<host> keeps the recursion on the starting host; the quoted host is
	// glued to -D by the shell, so wget still receives a single "-Dhost" argument.
	$cmd = WGET_CMD." -b -r -k -np -q -t 3 -S -N -a wget.log"
	     . " -P ".escapeshellarg(SAVE_DIR)
	     . " -D".escapeshellarg($domain['host'])
	     . " ".escapeshellarg($url);

	$ret = array();
	exec($cmd, $ret, $retCode);

	return $ret;
}
 
/**
 * Inserts or updates the site_list row for a downloaded URL.
 *
 * A row counts as "the same site" when its URL equals $url or its PATH equals
 * the local directory resolved for $url.  If such a row exists it is updated,
 * otherwise a new row is inserted.
 *
 * Fixes over the previous version:
 *  - every value is escaped with mysql_real_escape_string() (SQL injection);
 *  - when the local directory cannot be resolved, the PATH comparison is left
 *    out; before, the empty string matched, and overwrote, every row whose
 *    PATH was empty;
 *  - the MySQL error is read before the link is closed, and a failed connect
 *    no longer calls mysql_close(false).
 *
 * NOTE(review): ext/mysql was removed in PHP 7.0; kept for consistency with the
 * rest of the file -- migrate all DB functions to mysqli/PDO together.
 *
 * @param string $url      Source URL; must start with "http://".
 * @param string $title    Title to store.
 * @param string $comment  Comment to store.
 * @return bool true on success; false if $url was rejected or a DB call failed
 *              (the MySQL error text is echoed in that case).
 */
function dbput( $url, $title = "", $comment = "" ){
	if( !is_string($url) || !preg_match('#^http://#', $url) ){
		return false;
	}

	// "~" is looked up as "%7E" on disk -- presumably how wget names the
	// directories it saves; confirm against the data directory.
	$domain = parse_url(str_replace("~","%7E",$url));
	$host = (is_array($domain) && isset($domain['host'])) ? $domain['host'] : "";
	$path = (is_array($domain) && isset($domain['path'])) ? $domain['path'] : "";

	$con = mysql_connect(DB_HOST,DB_USER,DB_PASS);
	if( !$con ){
		echo(mysql_error());
		return false;
	}
	if( !mysql_select_db(DB_NAME, $con) ){
		echo(mysql_error($con));
		mysql_close($con);
		return false;
	}

	// Resolve the local directory.  realpath() fails while the download has not
	// created it yet; $dir is then "" and is not used for matching.
	$dir = realpath( SAVE_DIR."/".$host."/".$path );
	if( $dir === false ){
		$dir = "";
	}elseif( !is_dir($dir) ){
		$dir = dirname($dir);
	}

	$qUrl		= mysql_real_escape_string($url, $con);
	$qDir		= mysql_real_escape_string($dir, $con);
	$qTitle		= mysql_real_escape_string((string)$title, $con);
	$qComment	= mysql_real_escape_string((string)$comment, $con);

	$where = " where `URL` = '".$qUrl."'";
	if( $dir !== "" ){
		$where .= " or `PATH` = '".$qDir."'";
	}

	$rs = mysql_query(" select * from `site_list`".$where, $con);
	if( !$rs ){
		echo(mysql_error($con));
		mysql_close($con);
		return false;
	}

	if( mysql_num_rows($rs) == 0 ){
		// Positional insert: URL, PATH, TITLE, COMMENT, UPDATE.
		$sql  = " insert into `site_list`";
		$sql .= " values(";
		$sql .= "	'".$qUrl."',";
		$sql .= "	'".$qDir."',";
		$sql .= "	'".$qTitle."',";
		$sql .= "	'".$qComment."',";
		$sql .= "	NOW()";
		$sql .= " )";
	}else{
		$sql  = " update `site_list`";
		$sql .= " set ";
		$sql .= "	`URL`		= '".$qUrl."',";
		$sql .= "	`PATH`		= '".$qDir."',";
		$sql .= "	`TITLE`		= '".$qTitle."',";
		$sql .= "	`COMMENT`	= '".$qComment."',";
		$sql .= "	`UPDATE`	= NOW()";
		$sql .= $where;
	}

	if( !mysql_query($sql, $con) ){
		echo(mysql_error($con));
		mysql_close($con);
		return false;
	}

	mysql_close($con);
	return true;
}
 
/**
 * Deletes a downloaded site: removes its local directory and its site_list row(s).
 *
 * Fixes over the previous version:
 *  - "rm -rf" runs only when the resolved directory lies strictly inside
 *    SAVE_DIR.  A crafted URL (for example host "..") could previously resolve
 *    to a directory outside the download root, or to the root itself;
 *  - the directory is passed through escapeshellarg();
 *  - SQL values are escaped with mysql_real_escape_string();
 *  - an unresolved (empty) directory is no longer compared against PATH, which
 *    used to delete every row whose PATH was empty;
 *  - the MySQL error is read before the link is closed.
 *
 * NOTE(review): ext/mysql was removed in PHP 7.0; kept for consistency with the
 * rest of the file -- migrate all DB functions to mysqli/PDO together.
 *
 * @param string $url  Source URL; must start with "http://".
 * @return bool true on success; false if $url was rejected or a DB call failed
 *              (the MySQL error text is echoed in that case).
 */
function dbdel( $url ){
	if( !is_string($url) || !preg_match('#^http://#', $url) ){
		return false;
	}

	$domain = parse_url($url);
	$host = (is_array($domain) && isset($domain['host'])) ? $domain['host'] : "";
	$path = (is_array($domain) && isset($domain['path'])) ? $domain['path'] : "";

	$con = mysql_connect(DB_HOST,DB_USER,DB_PASS);
	if( !$con ){
		echo(mysql_error());
		return false;
	}
	if( !mysql_select_db(DB_NAME, $con) ){
		echo(mysql_error($con));
		mysql_close($con);
		return false;
	}

	// "~" is looked up as "%7E" on disk (same convention as the other helpers).
	$dir = realpath(str_replace("~","%7E", SAVE_DIR."/".$host."/".$path));
	if( $dir === false ){
		$dir = "";
	}elseif( !is_dir($dir) ){
		$dir = dirname($dir);
	}

	// Only delete something that is strictly below the download root.
	$root = realpath(SAVE_DIR);
	if( $dir !== "" && $root !== false && strpos($dir, $root."/") === 0 ){
		exec(RM_CMD." -rf ".escapeshellarg($dir), $ret, $retCode);
	}

	$sql  = " delete";
	$sql .= " from		`site_list`";
	$sql .= " where		`URL`	= '".mysql_real_escape_string($url, $con)."'";
	if( $dir !== "" ){
		$sql .= " 	 or		`PATH`	= '".mysql_real_escape_string($dir, $con)."'";
	}
	if( !mysql_query($sql, $con) ){
		echo(mysql_error($con));
		mysql_close($con);
		return false;
	}

	mysql_close($con);
	return true;
}
 
/**
 * Fetches $url and returns the text of its first <title> element, converted to UTF-8.
 *
 * Changes over the previous version:
 *  - uses PCRE; eregi()/eregi_replace() were removed in PHP 7;
 *  - the match is non-greedy, so a page containing more than one "</title>"
 *    no longer returns everything up to the last one;
 *  - a <title> tag that carries attributes is recognised;
 *  - a failed download (file_get_contents() === false) returns "".
 *
 * @param string $url  Page address; must start with "http://".
 * @return string The title, or "" when $url is rejected, the download fails,
 *                or no non-empty title is found.
 */
function getTitle( $url ){
	if( !is_string($url) || !preg_match('#^http://#', $url) ){
		return "";
	}

	$text = file_get_contents( $url );
	if( $text === false ){
		return "";
	}

	// i: case-insensitive tag name, s: "." also matches newlines inside the title.
	if( !preg_match('#<title(?:\s[^>]*)?>(.*?)</title>#is', $text, $regs) ){
		return "";
	}
	if( $regs[1] === "" ){
		return "";
	}

	// 'auto' lets mbstring detect the page's source encoding.
	return mb_convert_encoding($regs[1], 'UTF-8', 'auto');
}
 
/**
 * Archives the downloaded copy of $url into "<dirname>.zip" in the current
 * working directory (the archive is written as "../<dirname>.zip" from inside
 * SAVE_DIR) and returns the archive's file name.
 *
 * Existing *.zip files in the working directory are removed first.
 *
 * Fixes over the previous version:
 *  - success is decided by zip's exit status.  Before, any stdout counted as
 *    success, and the trailing "cd -" always prints a directory, so failures
 *    were reported as success;
 *  - returns "" when the local directory does not exist or resolves outside
 *    SAVE_DIR, instead of archiving an unrelated directory;
 *  - every path placed on the command line goes through escapeshellarg();
 *  - SAVE_DIR is used for the relative-path base instead of a second
 *    hard-coded "./data".
 *
 * @param string $url  URL of a previously downloaded site.
 * @return string Archive file name, or "" on failure.
 */
function zip( $url ){
	$domain = is_string($url) ? parse_url($url) : false;
	$host = (is_array($domain) && isset($domain['host'])) ? $domain['host'] : "";
	$path = (is_array($domain) && isset($domain['path'])) ? $domain['path'] : "";

	// "~" is looked up as "%7E" on disk (same convention as the other helpers).
	$dir = realpath(str_replace("~","%7E", SAVE_DIR."/".$host."/".$path));
	if( $dir === false ){
		return "";
	}
	if( !is_dir($dir) ){
		$dir = dirname($dir);
	}

	// Refuse anything that is not strictly below the download root.
	$root = realpath(SAVE_DIR);
	if( $root === false || strpos($dir, $root."/") !== 0 ){
		return "";
	}

	$file = basename($dir);
	$rel  = toRelative(SAVE_DIR, $dir);

	// Drop archives left over from earlier requests.
	exec(RM_CMD." *.zip", $rmOut, $rmCode);

	// "&&" so zip only runs if the cd succeeded; the exit status is then zip's own.
	$cmd = "cd ".escapeshellarg(SAVE_DIR)
	     . " && ".ZIP_CMD." -9 -r ".escapeshellarg("../".$file.".zip")." ".escapeshellarg($rel);
	$zipOut = array();
	exec($cmd, $zipOut, $zipCode);

	if( $zipCode === 0 ){
		return $file.".zip";
	}

	return "";
}
 
/**
 * Returns the relative path that leads from $base to $target.
 *
 * Both arguments are resolved with realpath(), so they must exist on disk.
 * If $base is a regular file its directory is used.  The result starts with
 * one "../" per remaining $base segment, or with "./" when $target lies at or
 * below $base.
 *
 * Fixes over the previous version:
 *  - identical paths no longer loop forever (both segment lists became empty
 *    and null == null kept the loop running);
 *  - segments are compared with ===, so numeric-looking names such as "10"
 *    and "1e1" are not treated as equal;
 *  - split() and each() (removed in PHP 7 / 8) are replaced with explode()
 *    and str_repeat().
 *
 * @param string $base    Directory (or file) the result is relative to.
 * @param string $target  Path to express relative to $base.
 * @return string Relative path using "/" separators.
 */
function toRelative( $base, $target ){
	if( is_file($base) ){
		$base = dirname($base);
	}

	// Empty segments (from the leading "/", or from a failed realpath) are
	// dropped so only real directory names are compared.
	$aryB = array_values(array_filter(explode("/", (string)realpath($base)), 'strlen'));
	$aryT = array_values(array_filter(explode("/", (string)realpath($target)), 'strlen'));

	// Strip the common leading directories.
	while( count($aryB) > 0 && count($aryT) > 0 && $aryB[0] === $aryT[0] ){
		array_shift( $aryB );
		array_shift( $aryT );
	}

	// Climb out of whatever is left of $base, then descend into $target.
	$path = str_repeat("../", count($aryB));
	if( $path === "" ){
		$path = "./";
	}

	return $path.implode("/", $aryT);
}
 
/**
 * Lists the sub-directories of $path by running "ls -F" and keeping the
 * entries that ls marks with a trailing "/".
 *
 * Changes over the previous version:
 *  - $path is passed through escapeshellarg(); it is therefore treated as one
 *    literal path (no shell globbing or extra options);
 *  - names are collected directly instead of being joined with "," and split
 *    again, which broke directory names containing a comma;
 *  - each() and ereg() (removed in PHP 8 / 7) are no longer used.
 *
 * For backward compatibility the "no directories" result is still an array
 * holding one empty string, which is what explode(",", "") produced before.
 *
 * @param string $path  Directory to list; "" lists the current directory.
 * @return array Directory names, each with its trailing "/".
 */
function lsdir( $path ){
	$cmd = LS_CMD." -F";
	if( (string)$path !== "" ){
		$cmd .= " ".escapeshellarg($path);
	}

	$ret = array();
	exec($cmd, $ret, $retCode);

	$dirs = array();
	foreach( $ret as $value ){
		if( substr($value, -1) === "/" ){
			$dirs[] = $value;
		}
	}

	if( count($dirs) == 0 ){
		return array("");
	}
	return $dirs;
}
 
/**
 * Outputs every site_list row as an XML document:
 * <sites><site url= title= comment= path= update= [updated=] /></sites>.
 *
 * When $update_check is "1", each site's stored UPDATE time is compared with
 * getLastModified() for its URL and reported as updated='true' or 'false'.
 *
 * Fixes over the previous version:
 *  - attribute values are XML-escaped; an "&", "<" or "'" in a URL, title or
 *    comment previously produced a malformed document;
 *  - the MySQL error is read before the link is closed, and a failed connect
 *    no longer calls mysql_close(false);
 *  - ereg_replace() (removed in PHP 7) is replaced with str_replace().
 *
 * NOTE(review): ext/mysql was removed in PHP 7.0; kept for consistency with the
 * rest of the file -- migrate all DB functions to mysqli/PDO together.
 *
 * @param string $update_check  "1" to add the updated= attribute.
 */
function select( $update_check = "0" ){
	$con = mysql_connect(DB_HOST,DB_USER,DB_PASS);
	if( !$con ){
		echo(mysql_error());
		return;
	}
	if( !mysql_select_db(DB_NAME, $con) ){
		echo(mysql_error($con));
		mysql_close($con);
		return;
	}

	$sql  = " select	*,";
	$sql .= "		UNIX_TIMESTAMP(`UPDATE`) AS UNIX_TIME";
	$sql .= " from		site_list";
	$sql .= " order		by URL";
	if( !($rs = mysql_query($sql, $con)) ){
		echo(mysql_error($con));
		mysql_close($con);
		return;
	}

	header( "Content-type: text/xml; charset=UTF-8" );
	print( "<?xml version='1.0' encoding='UTF-8' ?>\n" );
	print( "<sites>\n" );
	while( $item = mysql_fetch_assoc($rs) ){
		$url = (string)$item['URL'];

		$attrs = array(
			'url'		=> urldecode($url),
			'title'		=> (string)$item['TITLE'],
			'comment'	=> (string)$item['COMMENT'],
			// "%7E" is re-encoded as "%257E" so the literal "%7E" directory
			// name survives one round of URL decoding on the client.
			'path'		=> str_replace("%7E","%257E",toRelative(".",$item['PATH'])),
			'update'	=> (string)$item['UPDATE'],
		);
		if( $update_check == "1" ){
			$attrs['updated'] = ($item['UNIX_TIME'] < getLastModified($url)) ? "true" : "false";
		}

		print( "<site" );
		foreach( $attrs as $name => $value ){
			// Attributes are single-quoted, hence ENT_QUOTES.
			print( " ".$name."='".htmlspecialchars($value, ENT_QUOTES, 'UTF-8')."'" );
		}
		print( "/>\n" );
	}
	print( "</sites>\n" );

	mysql_close($con);
}
 
/**
 * Returns the Last-Modified time of $url as a Unix timestamp, obtained with an
 * HTTP HEAD request.
 *
 * Fixes over the previous version:
 *  - the socket is opened to the host only.  Before, "host/path" was passed to
 *    fsockopen(), so every URL that had a path failed to connect;
 *  - the HEAD request asks for the URL's own path and query instead of "/";
 *  - the header name is matched case-insensitively and its value is taken
 *    from the same match.  Before, a lower-case "last-modified:" header was
 *    found but never stripped, so strtotime() failed;
 *  - an explicit port in the URL is honoured;
 *  - ereg*() (removed in PHP 7) is replaced with preg_match().
 *
 * @param string $url  Address to check; must start with "http://".
 * @return int Unix timestamp, or -1 when $url is rejected, the connection
 *             fails, or no parsable Last-Modified header is returned.
 */
function getLastModified( $url ){
	if( !is_string($url) || !preg_match('#^http://#', $url) ){
		return -1;
	}

	$domain = parse_url($url);
	if( !is_array($domain) || empty($domain['host']) ){
		return -1;
	}

	$host = $domain['host'];
	$port = isset($domain['port']) ? (int)$domain['port'] : 80;
	$path = (isset($domain['path']) && $domain['path'] !== "") ? $domain['path'] : "/";
	if( isset($domain['query']) ){
		$path .= "?".$domain['query'];
	}

	// Whitespace or control characters would break (or inject into) the request line.
	if( preg_match('/[\s\x00-\x1f]/', $host.$path) ){
		return -1;
	}

	$fp = fsockopen( $host, $port, $errno, $errstr, 30 );
	if( !$fp ){
		return -1;
	}

	$out  = "HEAD ".$path." HTTP/1.1\r\n";
	$out .= "Host: ".$host.($port != 80 ? ":".$port : "")."\r\n";
	$out .= "Connection: Close\r\n\r\n";
	fwrite( $fp, $out );

	$date = -1;
	while( !feof($fp) ){
		$str = fgets( $fp, 1024 );
		// false: EOF or read error; blank line: end of the response headers.
		if( $str === false || rtrim($str, "\r\n") === "" ){
			break;
		}
		if( preg_match('/^Last-Modified:\s*(.+)$/i', rtrim($str, "\r\n"), $regs) ){
			$ts = strtotime($regs[1]);
			// strtotime() reports failure as false (as -1 on very old PHP).
			if( $ts !== false && $ts != -1 ){
				$date = $ts;
			}
			break;
		}
	}
	fclose($fp);

	return $date;
}
?>