對 wget 這個 tool 不熟,平常使用 wget 下載一些資料時,可以輕易地使用 --referer 來偽造 HTTP Header 資料,因此能夠通過對方 Server 檢查
wget --referer="REFERER_URL" "TARGET_URL"
然而,上述的 REFERER_URL 和 TARGET_URL 都是固定的位置,如果是會根據 session / cookie 而改變的話,不曉得還有沒有辦法?對我而言,寫 PHP 比去看 manpage 來得快 XD 所以我就寫成 PHP 囉!或許 wget 也有更方便的下法吧,改天再努力看 manpage
程式碼:
<?php
// Two-step download that mimics a browser session:
//   1. fetch $source_url, storing whatever cookies the server sets;
//   2. extract the download link from the returned HTML with $pattern;
//   3. request that link on the same cURL handle, sending the stored cookies
//      and $source_url as the HTTP Referer, and save the body to $output_file.
$output_file = 'result.file'; // where the downloaded body is written
$cookie_file = 'cookie.tmp'; // cookie jar shared by both requests
$source_url = 'SOURCE_URL'; // page to scrape; reused later as the Referer
$pattern = '/class="download" href="(.*?)"/'; // example pattern capturing TARGET_URL

$ch = curl_init();
curl_setopt( $ch , CURLOPT_URL, $source_url );
curl_setopt( $ch , CURLOPT_COOKIEFILE , $cookie_file );
curl_setopt( $ch , CURLOPT_COOKIEJAR , $cookie_file );
curl_setopt( $ch , CURLOPT_RETURNTRANSFER , true );
$result = curl_exec( $ch );
if( $result === false )
{
    // Without the source page there is nothing to scrape.
    echo "ERROR: Cannot fetch the source url: ".curl_error( $ch )."\n";
    curl_close( $ch );
    exit( 1 );
}

if( preg_match_all( $pattern , $result , $match ) )
{
    if( isset( $match[1][1] ) )
    {
        // Second hit of capture group 1; adjust the indexes to the pattern in use.
        $target_url = $match[1][1];
        $referer_url = $source_url;

        // The cookie options set above stay active on the handle, so only the
        // URL, the Referer and the output sink change for the second request.
        curl_setopt( $ch , CURLOPT_URL, $target_url );
        curl_setopt( $ch , CURLOPT_REFERER , $referer_url );

        $fp = fopen( $output_file , 'wb' );
        if( $fp === false )
        {
            echo "ERROR: Cannot open the output file to write:$output_file\n";
            curl_close( $ch );
            exit( 1 );
        }
        // Setting CURLOPT_FILE after CURLOPT_RETURNTRANSFER redirects the
        // response body into the file instead of returning it as a string.
        curl_setopt( $ch , CURLOPT_FILE , $fp );

        echo "GO...\n";
        $downloaded = curl_exec( $ch );
        fclose( $fp );
        if( $downloaded === false )
            echo "ERROR: Download failed: ".curl_error( $ch )."\n";
        else
            echo "Finish..\n";
    }
}
curl_close( $ch );
?>
以上是要從 SOURCE_URL 上頭, 找到下載位置(target_url), 然而, 那個位置卻每次都不一樣, 最重要的是跟 session 有關係並且下載 target_url 時還必須奉上 cookie 資訊, 所以, 先收集一下 cookie 囉!(上述程式並不謹慎, 例如儲存結果的檔案有可能開檔失敗)
後記,無聊又改寫成 tool mode:
<?php
// Command-line scraper/downloader.
//   1. Fetch the source url (-u), optionally keeping cookies in a file (-c).
//   2. Run the regexp (-e) over the page with preg_match_all().
//   3. Pick one captured value with -m 'group,index'.
//   4. Unless running in test mode, request the picked url with the source
//      url as Referer and write the body to -o or, by default, to stdout.
// Test mode (the default; switched off with -d) only prints the matches.

// Option table. Each key is a getopt() short-option spec (a trailing ':'
// means the option takes an argument). 'value' is the default and later the
// parsed value; a NULL default marks the option as mandatory. 'text' is the
// line shown in the help output.
$shortopt = array();
$shortopt['h'] = array(
    'value' => '' ,
    'text' => '-h, help' );
$shortopt['c:'] = array(
    'value' => '' ,
    'text' => "-c '/tmp/cookie_file' , tmp file for cookie" );
$shortopt['o:'] = array(
    'value' => '' ,
    'text' => "-o '/tmp/output_file' , path for result file. default use stdout" );
$shortopt['u:'] = array(
    'value' => NULL ,
    'text' => "-u 'http://www.google.com' , source url" );
$shortopt['e:'] = array(
    'value' => NULL ,
    'text' => "-e '/class=\"normal-down\" href=\"(.*?)\"/is' , regexp pattern for extract the target url" );
$shortopt['m:'] = array(
    'value' => '' ,
    'text' => "-m '1,1' , choose the result matched to be used. e.g. use the match[5][2] is '5,2'" );
$shortopt['d'] = array(
    'value' => 'true' ,
    'text' => "-d , disable test mode for showing the target matched by regexp pattern" );

// getopt() is required for argument parsing.
if( !function_exists( 'getopt' ) )
{
    echo "'getopt' is not supported in current PHP version.\n";
    exit;
}

// Build the getopt() spec string and the help text from the option table.
$shortopt_list = '';
$shortopt_help = '';
foreach( $shortopt as $k => $v )
{
    $shortopt_list .= $k;
    $shortopt_help .= "\t".$v['text']."\n";
}

// Parse the command line; getopt() may return false on failure.
$parse_arg = getopt( $shortopt_list );
if( !is_array( $parse_arg ) )
    $parse_arg = array();

// Show help.
if( isset( $parse_arg['h'] ) )
{
    echo "Usage> php ".$argv[0]." -h\n";
    echo $shortopt_help;
    exit;
}

// Copy parsed values into the option table.
foreach( $parse_arg as $k => $v )
{
    if( isset( $shortopt[$k] ) )
    {
        // Flag without argument: a default of 'false' flips to true, any other
        // default (such as 'true' for -d) flips to false.
        $shortopt[$k]['value'] = ( strcasecmp( $shortopt[$k]['value'] , 'false' ) === 0 );
    }
    else if( isset( $shortopt[$k.':'] ) )
    {
        $shortopt[$k.':']['value'] = $v;
    }
}

// Mandatory options still holding their NULL default were not supplied.
$check_out = '';
foreach( $shortopt as $k => $v )
{
    if( !isset( $v['value'] ) )
        $check_out .= "\t".$v['text']."\n";
}
if( !empty( $check_out ) )
{
    echo "Usage> php ".$argv[0]." -h\n";
    echo "Must Set:\n$check_out\n";
    exit( 1 );
}

$cookie_file = !empty( $shortopt['c:']['value'] ) ? $shortopt['c:']['value'] : NULL ;
$source_url = $shortopt['u:']['value'];
$output_file = !empty( $shortopt['o:']['value'] ) ? $shortopt['o:']['value'] : NULL ;
$regexp_pattern = $shortopt['e:']['value'];
if( !empty( $shortopt['m:']['value'] ) )
    $shortopt['m:']['value'] = trim( $shortopt['m:']['value'] );
// Trim every index so that "-m '1, 1'" selects the same entry as "-m '1,1'".
$choose_match = !empty( $shortopt['m:']['value'] )
    ? array_map( 'trim' , explode( ',' , $shortopt['m:']['value'] ) )
    : NULL;
// Without -m there is nothing to download, so fall back to test mode.
$test_mode = empty( $choose_match ) || $shortopt['d']['value'];

// First request: fetch the source page as a string.
$ch = curl_init();
curl_setopt( $ch , CURLOPT_URL, $source_url );
if( !empty( $cookie_file ) )
{
    curl_setopt( $ch , CURLOPT_COOKIEFILE , $cookie_file );
    curl_setopt( $ch , CURLOPT_COOKIEJAR , $cookie_file );
}
curl_setopt( $ch , CURLOPT_RETURNTRANSFER , true );
$result = curl_exec( $ch );
if( $result === false )
{
    // New diagnostics go to STDERR because stdout may carry the download.
    fwrite( STDERR , "ERROR: Cannot fetch the source url: ".curl_error( $ch )."\n" );
    curl_close( $ch );
    exit( 1 );
}

$found = preg_match_all( $regexp_pattern , $result , $matches );
if( $found === false )
{
    fwrite( STDERR , "ERROR: The regexp pattern could not be applied:$regexp_pattern\n" );
    curl_close( $ch );
    exit( 1 );
}

if( $found )
{
    $target_url = getTargetURL( $matches , $choose_match );
    if( $test_mode || empty( $target_url ) )
    {
        echo "Matched Target URL: \n";
        print_r( $matches );
        echo "Choose option(Cannot be empty):".$shortopt['m:']['value']."\n";
        echo "Target(Cannot be empty):$target_url\n";
    }
    else
    {
        // Second request on the same handle; the cookie options set above
        // remain active, so only the URL, Referer and output sink change.
        curl_setopt( $ch , CURLOPT_URL, $target_url );
        curl_setopt( $ch , CURLOPT_REFERER , $source_url );
        if( !empty( $output_file ) )
        {
            echo "Target URL:$target_url\n";
            echo "Referer URL:$source_url\n";
            $fp = fopen( $output_file , 'wb' );
            if( $fp === false )
            {
                echo "ERROR: Cannot open the output file to write:$output_file\n";
                curl_close( $ch );
                exit( 1 );
            }
            // CURLOPT_FILE set after CURLOPT_RETURNTRANSFER sends the body to the file.
            curl_setopt( $ch , CURLOPT_FILE , $fp );
            echo "Begin...\n";
            $downloaded = curl_exec( $ch );
            fclose( $fp );
            if( $downloaded === false )
                fwrite( STDERR , "ERROR: Download failed: ".curl_error( $ch )."\n" );
            else
                echo "...Finish\n";
        }
        else
        {
            // CURLOPT_RETURNTRANSFER is still on from the first request, which
            // would make curl_exec() return the body and leave it unused.
            // Turning it off lets cURL write the body straight to stdout.
            curl_setopt( $ch , CURLOPT_RETURNTRANSFER , false );
            if( curl_exec( $ch ) === false )
                fwrite( STDERR , "ERROR: Download failed: ".curl_error( $ch )."\n" );
        }
    }
}
curl_close( $ch );
exit;
/**
 * Walk into a (possibly nested) match array along a list of indexes and
 * return the value found there.
 *
 * @param mixed $matches Value to descend into, typically the array filled by
 *                       preg_match_all(). A non-array value is returned as is,
 *                       even when indexes remain.
 * @param mixed $choose  List of indexes to follow in order (e.g. array('1','1')
 *                       for $matches[1][1]), or a single scalar index.
 * @return mixed The selected value, or NULL when $matches is NULL, an index
 *               does not exist, or the indexes run out while the current
 *               value is still an array.
 */
function getTargetURL( $matches , $choose )
{
    if( !isset( $matches ) )
        return NULL;

    // Reached a leaf: nothing left to index into.
    if( !is_array( $matches ) )
        return $matches;

    if( is_array( $choose ) )
    {
        // Indexes exhausted but the value is still an array: no single target.
        // (Using the empty list itself as an offset is an illegal offset type.)
        if( count( $choose ) === 0 )
            return NULL;

        $index = array_shift( $choose );
        // Only scalar indexes are valid array offsets.
        if( is_scalar( $index ) && isset( $matches[ $index ] ) )
            return getTargetURL( $matches[ $index ] , $choose );
        return NULL;
    }

    // Single scalar index supplied instead of a list.
    if( is_scalar( $choose ) && isset( $matches[ $choose ] ) )
        return $matches[ $choose ];

    return NULL;
}
?>
用法:
單純以抓 Yahoo! News 為例
尚未指定 -m
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is'
Matched Target URL:
Array
(
[0] => Array
(
[0] => <h3><a href="news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html" title="莫拉克風災 學者:無關暖化"
[1] => <h3><a href="news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html" title="立院藏七寶 總價數億元"
)
[1] => Array
(
[0] => news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html
[1] => news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html
)
[2] => Array
(
[0] => 莫拉克風災 學者:無關暖化
[1] => 立院藏七寶 總價數億元
)
)
Choose option(Cannot be empty):
Target(Cannot be empty):
指定 -m '1,1'
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1'
Matched Target URL:
Array
(
[0] => Array
(
[0] => <h3><a href="news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html" title="莫拉克風災 學者:無關暖化"
[1] => <h3><a href="news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html" title="立院藏七寶 總價數億元"
)
[1] => Array
(
[0] => news/a/h1/t/*http://tw.news.yahoo.com/article/url/d/a/100628/5/289yr.html
[1] => news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html
)
[2] => Array
(
[0] => 莫拉克風災 學者:無關暖化
[1] => 立院藏七寶 總價數億元
)
)
Choose option(Cannot be empty):1,1
Target(Cannot be empty):news/a/h2/t/*http://tw.news.yahoo.com/article/url/d/a/100628/69/289tr.html
正式要下載請記得加 -d (disable test) , 但此例不適用, 因為抓出來的 url 並不完整, 開頭只是 "news/a/h2/t/*....."
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1' -d
輸出到檔案
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1' -d -o '/tmp/output'
需要 cookie
# php my_wget.php -u 'http://tw.yahoo.com' -e '/<h3><a href="([^"]+)" title="([^"]+)"/is' -m '1,1' -d -o '/tmp/output' -c '/tmp/cookie'
|