代码出处:jUnion
  适用平台:Windows, Linux(Ubuntu),php-5.2.5+,Apache
  功能:抓取整个站点的图片,暂未借助php的curl扩展开发, 后期完善
  配置:config目录下      domain_name:域名(默认:bizhibar.com)      request_site:网站网址(默认:http://www.bizhibar.com/)      request_url:从网站的哪个页面开始(默认:http://www.bizhibar.com/)      accept_type: 图片类型(默认:gif, bmp, png, ico,  jpg, jpeg)      save_path:图片保存路径(默认:savefiles/)      partition_name:图片保存目录名称前缀(默认:img_)      dir_file_limit: 每个目录容许多少个文件(默认:100)      serialize_img_size: 当读取了多少个图片地址才缓存到cache目录下的accompImg文件当中,下次继续抓取的时候会忽略这些地址。(默认:30)      serialize_url_size:与serialize_img_size一样,已读取多少个链接地址才缓存到cache目录下的overURL,下次继续抓取的时候忽略这些地址。(默认:10)
  说明:欢迎诸君批评指教,有任何新问题或者需要改进的地方,请您反馈给我
 
- 
// A crawl has no predictable duration, so remove PHP's execution time limit.
set_time_limit(0);

// The constants file is loaded through DIRECTORY_SEPARATOR because __Home__ and
// __Os__ are not available yet (presumably this file defines them - confirm).
require dirname(__FILE__).DIRECTORY_SEPARATOR.'include'.DIRECTORY_SEPARATOR.'Capture.const.php';
require __Home__.'include'.__Os__.'Capture.class.php';

// Paths consumed by Capture: two config files and two cache files.
// This has to be a real variable because Capture::__construct() takes it by reference.
$_cfg = array(
	'site'      => __Home__.'config'.__Os__.'capture.site.php',
	'preg'      => __Home__.'config'.__Os__.'capture.preg.php',
	'accompImg' => __Home__.'cache'.__Os__.'accompImg',
	'overURL'   => __Home__.'cache'.__Os__.'overURL'
);

// Build the crawler and start from the configured request_url.
$_parse = new Capture( $_cfg );
$_parse->parseQuestUrl();

?>
 
  
复制代码
         		            	
            	            	
            	            	
            
- 
/**
 * The main class.
 *
 * Loads the site and pattern configuration, restores the cached crawl state,
 * then walks the links returned by Pivotal::parseUrl() recursively, starting
 * at the configured 'request_url'.
 *
 * @author pankai
 * @date 2013-08-10
 */
class Capture {

	/** Paths passed to the constructor (keys: site, preg, accompImg, overURL). */
	private static $_Config = array();

	/** Value returned by the 'site' config file. */
	private static $_CapSite = NULL;

	/** Value returned by the 'preg' config file, after '_request_site' substitution. */
	private static $_CapPreg = NULL;

	/** URLs that have already been visited; persisted to the 'overURL' cache file. */
	private static $_overURL = array();

	/** TRUE once getCapInstance() has run to completion for the current page. */
	private $_mark = FALSE;

	/** Multiplier applied to 'preform_page_time' when sleeping between pages. */
	private static $_markTime = 1;

	/**
	 * initialize the main class: Capture
	 *
	 * Loads both config files, substitutes the request site into every pattern
	 * and restores the visited-URL and processed-image caches when they exist.
	 *
	 * @param $_cfg array paths with the keys 'site', 'preg', 'accompImg' and 'overURL'
	 */
	public function __construct( &$_cfg ) {
		self::$_Config = &$_cfg;

		self::$_CapSite = require $_cfg['site'];
		self::$_CapPreg = require $_cfg['preg'];

		foreach( self::$_CapPreg as $_key => $_value ) {
			self::$_CapPreg[$_key] = str_replace( '_request_site', self::$_CapSite['request_site'], $_value );
		}

		self::import( 'file.OperateFile' );
		$_cached = self::readCache( $_cfg['overURL'] );
		// Only accept an array: the list is later used with in_array() and [].
		if( is_array( $_cached ) ) {
			self::$_overURL = $_cached;
		}

		self::import( 'pivotal.Pivotal' );
		$_cached = self::readCache( $_cfg['accompImg'] );
		// FALSE means "no usable cache"; keep whatever Pivotal starts with.
		if( $_cached !== FALSE ) {
			Pivotal::$_accompImg = $_cached;
		}
	}

	/**
	 * Read and unserialize one cache file.
	 *
	 * Requires OperateFile to have been imported already.
	 *
	 * @param $_path string cache file path
	 * @return mixed the unserialized value, or FALSE when the file is missing,
	 *               empty or cannot be unserialized
	 */
	private static function readCache( $_path ) {
		if( !file_exists( $_path ) ) {
			return FALSE;
		}
		$_size = filesize( $_path );
		if( $_size === FALSE || $_size <= 0 ) {
			return FALSE;
		}
		// NOTE(review): unserialize() is only safe while the cache directory is
		// written exclusively by this program.
		return unserialize( OperateFile::readText( $_path, $_size ) );
	}

	/**
	 * load class, follow Java pragrammer(package): import com.jUnion.Capture
	 *
	 * 'file.OperateFile' resolves to include/file/OperateFile.class.php below __Home__.
	 *
	 * @param $_class dotted class path relative to the include directory
	 */
	public static function import( $_class ) {
		require_once __Home__.'include'.__Os__.str_replace( '.', __Os__, $_class ).'.class.php';
	}

	/**
	 * create an instance of Pivotal class
	 *
	 * $this->_mark is FALSE while the page is being parsed and TRUE once
	 * parseUrl() has returned.
	 *
	 * @param $_source page source handed to Pivotal
	 * @return mixed whatever Pivotal::parseUrl() returns
	 */
	private function getCapInstance( &$_source ) {
		$this->_mark = FALSE;

		$_Captal = new Pivotal( self::$_Config, $_source );
		$_tagA = $_Captal->parseUrl();

		$this->_mark = TRUE;

		return $_tagA;
	}

	/**
	 * go forward one by one
	 *
	 * Nested arrays are descended into. Each remaining entry is treated as a URL:
	 * it is skipped when it does not contain the configured domain name or was
	 * visited before; otherwise it is recorded, fetched, parsed, and the links
	 * found on it are followed recursively.
	 *
	 * @param $_tagArr array of URLs and/or nested arrays of URLs
	 */
	private function roundTagA( &$_tagArr ) {
		if( empty( $_tagArr ) ) {
			return;
		}
		$_tagArrLength = count( $_tagArr );
		for( $i = 0; $i < $_tagArrLength; $i++ ) {
			if( is_array( $_tagArr[ $i ] ) ) {
				$this->roundTagA( $_tagArr[ $i ] );
				continue;
			}

			$_url = $_tagArr[ $i ];

			// Stay on the configured site.
			if( stripos( $_url, self::$_CapSite['domain_name'] ) === FALSE ) {
				continue;
			}
			// Never visit the same URL twice.
			if( in_array( $_url, self::$_overURL, TRUE ) ) {
				continue;
			}

			self::$_overURL[] = $_url;
			// Flush the visited list to disk every 'serialize_url_size' URLs.
			if( count( self::$_overURL ) % self::$_CapSite['serialize_url_size'] == 0 ) {
				OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
			}

			do {
				// getCapInstance() takes its argument by reference, so the
				// response has to live in a variable first.
				$_source = Http::get( $_url );
				$_tagA = $this->getCapInstance( $_source );
				sleep( self::$_CapSite['preform_page_time'] * self::$_markTime );
				if( $this->_mark === TRUE ) {
					// NOTE(review): _mark is always TRUE once getCapInstance()
					// returns, so the doubling branch below is not reached in
					// normal flow. The multiplier is also reset to
					// 'preform_page_time' rather than to its initial value 1 -
					// confirm which was intended.
					self::$_markTime = self::$_CapSite['preform_page_time'];
					break;
				}
				self::$_markTime *= 2;
			} while( true );

			/* parse the main page and return next page */
			$this->roundTagA( $_tagA );
		}
	}

	/**
	 * Entry point: fetch the configured 'request_url', parse it and follow
	 * every link found from there.
	 */
	public function parseQuestUrl() {
		self::import( 'http.Http' );
		// Held in a variable because getCapInstance() takes it by reference.
		$_source = Http::get( self::$_CapSite['request_url'] );
		$_round_Arr = $this->getCapInstance( $_source );
		$this->roundTagA( $_round_Arr );
	}

}

?>
 
  
复制代码
 
 |