合法優良搬家公司 價格透明,合約保障真安心 | 居家清潔請找 易祺清潔公司 |
[php] 簡單擷取 html 原始碼 |
房東:小情 發表時間:2011-03-17 | [檢舉] |
來介紹一套解析 html 原始碼的 open source: PHP Simple HTML DOM Parser,這套程式可以讓您任意對原始碼進行擷取操作,抓取一些您需要的資訊,再搭配 preg_match 跟 preg_match_all 函數來使用,使用方法可以參考線上 Document,簡單的範例如下(參考官方網站): // Create a DOM object from a string $html = str_get_html(\'<html><body>Hello!</body></html>\'); // Create a DOM object from a URL $html = file_get_html(\'http://www.google.com/\'); // Create a DOM object from a HTML file $html = file_get_html(\'test.htm\'); 程式提供了三種方式讓您讀取原始碼,您可以直接丟 $string 或者是網址列,或者是檔案都可以,如果使用過 jQuery 您會發現在擷取 dom 的寫法很像,參考使用說明都寫得很清楚,由於 CodeIgniter 沒有此功能,所以我把程式改了一下 porting 到 CI 的 libraries 資料夾裡面,Patch 檔案 --- simplehtmldom/simple_html_dom.php 2008-12-15 02:56:56.000000000 +0800 +++ application/libraries/Simple_html_dom.php 2010-09-29 14:09:11.000000000 +0800 @@ -1,4 +1,4 @@ -<?php +<?php if ( ! defined(\'BASEPATH\')) exit(\'No direct script access allowed\'); /******************************************************************************* Version: 1.11 ($Rev: 175 $) Website: http://sourceforge.net/projects/simplehtmldom/ @@ -30,56 +30,6 @@ define(\'HDOM_INFO_OUTER\', 6); define(\'HDOM_INFO_ENDSPACE\',7); -// helper functions -// ----------------------------------------------------------------------------- -// get html dom form file -function file_get_html() { - $dom = new simple_html_dom; - $args = func_get_args(); - $dom->load(call_user_func_array(\'file_get_contents\', $args), true); - return $dom; -} - -// get html dom form string -function str_get_html($str, $lowercase=true) { - $dom = new simple_html_dom; - $dom->load($str, $lowercase); - return $dom; -} - -// dump html dom tree -function dump_html_tree($node, $show_attr=true, $deep=0) { - $lead = str_repeat(\' \', $deep); - echo $lead.$node->tag; - if ($show_attr && count($node->attr)>0) { - echo \'(\'; - foreach($node->attr as $k=>$v) - echo "[$k]=>\\"".$node->$k.\'", \'; - echo \')\'; - } - echo "\\n"; - - foreach($node->nodes as $c) - dump_html_tree($c, $show_attr, $deep+1); -} - -// get dom form file (deprecated) -function 
file_get_dom() { - $dom = new simple_html_dom; - $args = func_get_args(); - $dom->load(call_user_func_array(\'file_get_contents\', $args), true); - return $dom; -} - -// get dom form string (deprecated) -function str_get_dom($str, $lowercase=true) { - $dom = new simple_html_dom; - $dom->load($str, $lowercase); - return $dom; -} - -// simple html dom node -// ----------------------------------------------------------------------------- class simple_html_dom_node { public $nodetype = HDOM_TYPE_TEXT; public $tag = \'text\'; @@ -476,9 +426,8 @@ function previousSibling() {return $this->prev_sibling();} } -// simple html dom parser -// ----------------------------------------------------------------------------- -class simple_html_dom { +class Simple_html_dom +{ public $root = null; public $nodes = array(); public $callback = null; @@ -515,13 +464,60 @@ $this->load_file($str); else $this->load($str); - } + } } function __destruct() { $this->clear(); } + // get html dom form file + function file_get_html() + { + $args = func_get_args(); + $this->load(call_user_func_array(\'file_get_contents\', $args), true); + return $this; + } + + // get html dom form string + function str_get_html($str, $lowercase=true) + { + $this->load($str, $lowercase); + return $this; + } + + // dump html dom tree + function dump_html_tree($node, $show_attr=true, $deep=0) + { + $lead = str_repeat(\' \', $deep); + echo $lead.$node->tag; + if ($show_attr && count($node->attr)>0) { + echo \'(\'; + foreach($node->attr as $k=>$v) + echo "[$k]=>\\"".$node->$k.\'", \'; + echo \')\'; + } + echo "\\n"; + + foreach($node->nodes as $c) + $this->dump_html_tree($c, $show_attr, $deep+1); + } + + // get dom form file (deprecated) + function file_get_dom() + { + $args = func_get_args(); + $this->load(call_user_func_array(\'file_get_contents\', $args), true); + return $this; + } + + // get dom form string (deprecated) + function str_get_dom($str, $lowercase=true) + { + $this->load($str, $lowercase); + return $this; 
+ } + // load html from string function load($str, $lowercase=true) { // prepare @@ -971,5 +967,4 @@ function getElementByTagName($name) {return $this->find($name, 0);} function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);} function loadFile() {$args = func_get_args();$this->load(call_user_func_array(\'file_get_contents\', $args), true);} -} -?> \\ No newline at end of file +} \\ No newline at end of file 在 CI Controller 底下使用方法: $this->load->library("simple_html_dom"); $box_url = "http://mlb.mlb.com/news/boxscore.jsp?gid=2010_09_28_phimlb_wasmlb_1"; /* load url */ $dom = $this->simple_html_dom->file_get_dom($box_url); $result = $dom->find(\'div#datenav\'); foreach($result as $v) { preg_match_all("/<option\\s+value=\\"([^>]+)\\">/",$v->outertext, $team_game); print_r($team_game); } |
廣利不動產-板橋在地生根最實在--新板特區指名度最高、值得您信賴的好房仲 完整房訊,房屋、店面熱門精選物件,廣利不動產 優質仲介,房屋租賃、買賣資訊透明,交易真安心! 廣利不動產-新板特區指名度最高、值得您信賴的好房仲 您的托付,廣利用心為您服務 |