偶尔会发现各Drupal系列网站的sitemap.xml丢失的情况,应该是在运行cron阶段把老的sitemap.xml删除了,但在生成新的sitemap.xml的时候因为网站数据量大导致mysql进程被杀死或者其它超时等原因重新生成失败,我们管理员一般不会去特意检查这个sitemap是否存在,而搜索引擎会一遍一遍经常检查,遇到不存在报错的情况可能会存在相当长的时间。(解决办法:可以在Sitemap配置中的ADVANCED SETTINGS里面把Disable cron generation of sitemap files.勾选,或者运行drush vset xmlsitemap_disable_cron_regeneration 1)
今天想到用以前编写的PHP巡检程序来检查,在博客中找到以前记录的需求《网站监控程序的开发需求》、《关于网站及服务器状态监控的笔记》,是2013年记录的,后来2014年自己编写了程序,虽然不是很完善,但也基本可用,已经进行了几年的站点巡检工作,这次干脆把源代码也贴出来:
<?php
/*
* 网站监控程序
* James Qi
*
* 运行办法:php main.php 参数1 参数2
* 参数1:必填,point_id,其中point_id是检测点参数,表示从这个程序检测哪些系列
* 参数2:可选,page_name,需要巡检的页面名称,例如sitemap.xml/robots.txt等,如果不填写则默认为空
*
* 备忘记录:
* 2013年11月:做了初步需求规划,同事进行了初次开发,用web界面实现了部分巡检,但还存在一些问题,未投入实际使用,先用外部免费监控服务
* 2014年12月:做了基本的站点循环、页面检查、邮件报警,数据库已经建表但尚未启用,目前只用了PHP程序来进行设置、运行,也没有WEB界面
*
* 工作列表:
* 1、分为子程序:main.php, groups.inc.php, group.inc.php, site.inc.php, page.inc.php, robots.php, sitemap.php等;
* 2、数据库记录,考虑设置web界面;
* 3、需要巡检的站点再仔细检查、添加;
* 4、巡检模式:是否检查手机版、是否检查多语言、是否检查首页、是否检查分类页、是否检查列表页、是否检查内页等;
* 5、智能判断等,详见http://jamesqi.com/node/442 《网站监控程序的开发需求》。
*/
// The shared log buffer has to exist before the first log_print() call.
// Initialising it after argument parsing would throw away the point_id /
// page_name lines that were already logged, so they would be missing from
// the summary e-mail.
$log = '';

// Command-line arguments: point_id (required), page_name (optional).
if (isset($argv[1])) {
    $point_id = $argv[1];
    log_print("point_id = $point_id\n");
} else {
    print "no arg, please add point_id to run this programe like this: 'php main.php point_id'.\n";
    // Non-zero status so cron / shell wrappers can tell the run did not happen.
    exit(1);
}
if (isset($argv[2])) {
    $page_name = $argv[2];
    log_print("page_name = $page_name\n");
} else {
    $page_name = '';
}

//system settings
// A page passes when at least one of these markers appears in its body.
$should_include = 'Drupal 7,Drupal 6,Drupal,nokia_mobile,MediaWiki 1.23.5,MediaWiki 1.16.0,MediaWiki,sitemap,robots';
// A page fails when any of these markers appears in its body.
$should_not_include = 'Fatal error,Warning:,Notice:,403 Forbidden,Apache 2 Test Page,undefined,Error,';
$should_not_include .= 'Too many connections,PDOException,file_exists(),mysql_connect,Cannot contact the database server,A database query error has occurred,This wiki has a problem,Error:,HTTP request,Internal error,';
$should_not_include .= '报错:,出错:,警告:';
//$alarm = TRUE;
//$email = 'myemail@example.com';
$timeout = 60; // response-time threshold in seconds; slower pages raise an alarm
$retry = 3;    // maximum number of retries
$delay = 30;   // pause between retries, in seconds
log_print("should_include = $should_include\n");
log_print("should_not_include = $should_not_include\n");
log_print("timeout = $timeout, retry = $retry, delay = $delay\n");

// Run statistics, updated by the check_* functions through `global`.
$count_group = 0;
$count_site = 0;
$count_page = 0;
$count_retry = 0;
$count_error = 0;
$count_header = 0;
$count_body = 0;
$time_header = 0;
$time_body = 0;
//monitor points settings
// Each monitoring point lists the site groups it is responsible for and the
// mailbox that receives its alarms and summary.
$point = array();
$point['china_0'] = array(
    'point_id' => 'china_0',
    'point_name' => 'China 0',
    'groups' => 'mingluji_gongshang,mingluji_hangye',
    'email' => 'myemail@example.com',
    'check_mode' => 'full'
);
$point['china_1'] = array(
    'point_id' => 'china_1',
    'point_name' => 'China 1',
    'groups' => 'jamesqi,jamesqi_wiki',
    'email' => 'myemail@example.com',
    'check_mode' => 'full'
);
$point['foreign_0'] = array(
    'point_id' => 'foreign_0',
    'point_name' => 'Foreign 0',
    'groups' => 'en18daonet,bizdirlib_usa',
    'email' => 'myemail@example.com',
    'check_mode' => 'full'
);
$point['foreign_1'] = array(
    'point_id' => 'foreign_1',
    'point_name' => 'Foreign 1',
    'groups' => 'chahaoba_wiki_language,youbianku_wiki_language',
    'email' => 'myemail@example.com',
    'check_mode' => 'full'
);
$point['test'] = array(
    'point_id' => 'test',
    'point_name' => 'Test',
    'groups' => 'test1,test2',
    'email' => 'myemail@example.com',
    'check_mode' => 'full'
);

// Refuse to run with an unknown point_id: otherwise the lookups below raise
// undefined-index notices, no group is checked, and the final statistics
// divide by zero.
if (!isset($point[$point_id])) {
    print "unknown point_id '$point_id', available: " . implode(', ', array_keys($point)) . "\n";
    exit(1);
}

$groups = $point[$point_id]['groups'];
$email = $point[$point_id]['email'];
$line = "groups = $groups, email = $email\n";
log_print($line);
//website group settings
// One entry per site family. The main loop reads group_id, sub_mode, base_url
// and sub_sites; the remaining keys are descriptive metadata.
// An empty entry in sub_sites (e.g. a leading comma) stands for the bare
// base_url itself.
// The key is spelled 'example_site' everywhere (some entries previously used
// 'example-site', which made the two spellings different array keys).
$group = array();
$group['mingluji_gongshang'] = array(
    'group_id' => 'mingluji_gongshang',
    'group_name' => 'mingluji new gongshang',
    'base_url' => 'gongshang.mingluji.com',
    'sub_sites' => ',beijing,tianjin,hebei,neimenggu,shanxi,shanghai,anhui,jiangsu,zhejiang',
    'sub_mode' => 'sub_path',
    'cms' => 'Drupal 7',
    'mobile_mode' => 'sub_domain',
    'languages' => 'zh-hans',
    'home_site' => 'gongshang.mingluji.com',
    'example_site' => 'gongshang.mingluji.com/xizang',
    'check_mode' => 'full'
);
$group['mingluji_hangye'] = array(
    'group_id' => 'mingluji_hangye',
    'group_name' => 'mingluji hangye',
    'base_url' => 'hangye.mingluji.com',
    'sub_sites' => ',anquan,bangongwenjiao,baojian,baozhuang,caiwu,canyinyule,dianchi',
    'sub_mode' => 'sub_path',
    'cms' => 'Drupal 7',
    'mobile_mode' => 'sub_domain',
    'languages' => 'zh-hans',
    // NOTE(review): home_site/example_site point at ditu.mingluji.com while
    // base_url is hangye.mingluji.com - possibly a copy-paste leftover; confirm.
    'home_site' => 'ditu.mingluji.com',
    'example_site' => 'ditu.mingluji.com/xizang',
    'check_mode' => 'full'
);
$group['jamesqi'] = array(
    'group_id' => 'jamesqi',
    'group_name' => 'jamesqi',
    'base_url' => 'jamesqi.com',
    'sub_sites' => ',m,jiapu',
    'sub_mode' => 'sub_domain',
    'cms' => 'Drupal 7',
    'mobile_mode' => 'sub_domain',
    'languages' => 'zh-hans',
    'home_site' => 'jamesqi.com',
    'example_site' => 'jamesqi.com',
    'check_mode' => 'full'
);
$group['jamesqi_wiki'] = array(
    'group_id' => 'jamesqi_wiki',
    'group_name' => 'jamesqi wiki sites',
    'base_url' => 'jamesqi.com',
    'sub_sites' => 'www,mobile',
    'sub_mode' => 'sub_domain',
    'cms' => 'MediaWiki',
    'mobile_mode' => 'sub_domain',
    'languages' => 'zh-cn',
    'home_site' => 'www.jamesqi.com',
    'example_site' => 'www.jamesqi.com',
    'check_mode' => 'full'
);
$group['en18daonet'] = array(
    'group_id' => 'en18daonet',
    'group_name' => 'en18daonet wiki',
    'base_url' => '18dao.net',
    'sub_sites' => 'en',
    'sub_mode' => 'sub_domain',
    'cms' => 'MediaWiki',
    'mobile_mode' => 'm.',
    'languages' => 'en',
    'home_site' => 'en.18dao.net',
    'example_site' => 'en.18dao.net',
    'check_mode' => 'full'
);
$group['bizdirlib_usa'] = array(
    'group_id' => 'bizdirlib_usa',
    'group_name' => 'bizdirlib_usa',
    'base_url' => 'bizdirlib.com',
    'sub_sites' => 'usa,ak,al,ar,az,ca,co,ct,dc,de,fl,ga,hi,ia,id,il,in',
    'sub_mode' => 'sub_domain',
    'cms' => 'Drupal 6',
    'mobile_mode' => '/m',
    'languages' => '49',
    'home_site' => 'usa.bizdirlib.com',
    'example_site' => 'ak.bizdirlib.com',
    'check_mode' => 'full'
);
$group['chahaoba_wiki_language'] = array(
    'group_id' => 'chahaoba_wiki_language',
    'group_name' => 'chahaoba_wiki_language',
    'base_url' => 'chahaoba.com',
    'sub_sites' => 'ar,de,en,es,fr,it,ja,ko,pt,ru',
    'sub_mode' => 'sub_domain',
    'cms' => 'MediaWiki',
    'mobile_mode' => '.m.',
    'languages' => '10',
    'home_site' => 'en.chahaoba.com',
    'example_site' => 'en.chahaoba.com',
    'check_mode' => 'full'
);
$group['youbianku_wiki_language'] = array(
    'group_id' => 'youbianku_wiki_language',
    'group_name' => 'youbianku_wiki_language',
    'base_url' => 'youbianku.com',
    'sub_sites' => 'ar,de,en,es,fr,it,ja,ko,pt,ru',
    'sub_mode' => 'sub_domain',
    'cms' => 'MediaWiki',
    'mobile_mode' => '.m.',
    'languages' => '10',
    'home_site' => 'en.youbianku.com',
    'example_site' => 'en.youbianku.com',
    'check_mode' => 'full'
);
//print_r($group);
//functions
/**
 * Build the comma-separated site list for a group whose sub-sites live in
 * sub-directories of one host.
 *
 * An empty sub-site entry maps to the bare $base_url (no trailing slash), the
 * same way sub_domain() treats it. Previously it produced "$base_url/", which
 * turned into "host//sitemap.xml" once a page name was appended.
 *
 * @param string $base_url  Host name without scheme, e.g. "example.com".
 * @param string $sub_sites Comma-separated sub-site names; entries are trimmed.
 * @return string Comma-separated list of "host" / "host/sub" entries.
 */
function sub_path($base_url, $sub_sites) {
    $sites = array();
    foreach (explode(',', $sub_sites) as $value) {
        $value = trim($value);
        $sites[] = ($value === '') ? $base_url : "$base_url/$value";
    }
    return implode(',', $sites);
}
/**
 * Build the comma-separated site list for a group whose sub-sites are
 * sub-domains of one host.
 *
 * @param string $base_url  Host name without scheme, e.g. "example.com".
 * @param string $sub_sites Comma-separated sub-domain prefixes; an empty
 *                          entry stands for $base_url itself.
 * @return string Comma-separated list of host names.
 */
function sub_domain($base_url, $sub_sites) {
    $hosts = array();
    foreach (explode(',', $sub_sites) as $prefix) {
        $prefix = trim($prefix);
        $hosts[] = ($prefix == '') ? $base_url : "$prefix.$base_url";
    }
    return implode(',', $hosts);
}
/**
 * Run check_site() on every entry of a comma-separated site list.
 *
 * @param string $sites Comma-separated sites; each entry is trimmed first.
 */
function check_sites($sites) {
    $site_list = array_map('trim', explode(',', $sites));
    foreach ($site_list as $site) {
        check_site($site);
    }
}
/**
 * Check one site: bump the site counter, log a separator, and hand the page
 * to check_page().
 *
 * When the global $page_name is non-empty it is appended to the site with
 * exactly one slash between them, so a site that already ends in "/" (or a
 * page name that starts with one) does not produce "host//page".
 *
 * @param string $site Host, optionally with a path, without scheme.
 */
function check_site($site) {
    global $count_site;
    global $page_name;

    $count_site++;
    log_print("------------------------------------\n count_site = $count_site, site = $site\n");

    if ($page_name != '') {
        $page = rtrim($site, '/') . '/' . ltrim($page_name, '/');
    } else {
        $page = $site;
    }
    check_page($page);
}
/**
 * Fetch one page over HTTP and raise alarms for anything suspicious.
 *
 * Steps:
 *  1. Request the headers with get_headers(), retrying up to $retry times
 *     with $delay seconds between attempts.
 *  2. Require the final response of the redirect chain to be status 200.
 *  3. Download the body, then alarm when it took longer than $timeout
 *     seconds, lacks every should_include marker, or contains any
 *     should_not_include marker.
 *
 * Failing to get headers, a non-200 status and failing to download the body
 * each add one to $count_error (the marker checks count their own errors).
 * A slow but otherwise healthy page only raises an alarm.
 *
 * @param string $page Host plus optional path, without scheme.
 */
function check_page($page) {
    global $count_page, $retry, $delay, $timeout;
    global $count_retry, $count_error;
    global $count_header, $count_body;
    global $time_header, $time_body;

    $count_page++;
    $url = "http://$page";

    // --- headers, with retries ------------------------------------------
    $header = FALSE;
    $time_long = 0;
    for ($count_retry = 0; $count_retry <= $retry; $count_retry++) {
        $time_start = microtime(TRUE);
        $header = get_headers($url, 1);
        $time_long = round(microtime(TRUE) - $time_start, 3);
        $count_header++;
        $time_header = $time_header + $time_long;

        if (!empty($header)) {
            break;
        }
        // Never implode() a failed result: that is a TypeError on PHP 8.
        log_print("header length = 0 bytes, time = $time_long seconds\n");
        log_print("count_retry = $count_retry\n");
        // No point in sleeping after the last attempt.
        if ($count_retry < $retry) {
            sleep($delay);
        }
    }

    if (empty($header)) {
        $count_error++;
        log_print("page = $page, code = get headers FALSE, count_retry = $count_retry\n");
        alarm($page, "get headers FALSE, retry $count_retry times", "get headers FALSE, retry $count_retry times");
        return;
    }

    // With the associative format, every response of a redirect chain leaves
    // its status line under an integer key, and repeated headers become
    // arrays. Keep the last status line and flatten array values to strings.
    $status_line = '';
    $flat_header = array();
    foreach ($header as $key => $value) {
        if (is_array($value)) {
            $value = implode(', ', $value);
        }
        if (is_int($key) && strncmp($value, 'HTTP/', 5) === 0) {
            $status_line = $value;
        }
        $flat_header[$key] = $value;
    }
    log_print('header length = ' . strlen(implode(',', $flat_header)) . " bytes, time = $time_long seconds\n");
    print_r($header);

    $status_code = 0;
    if (preg_match('#^HTTP/\S+\s+(\d{3})#', $status_line, $match)) {
        $status_code = (int) $match[1];
    }

    if ($status_code !== 200) {
        $count_error++;
        log_print("page = $page, code = not 200 OK\n");
        alarm($page, $status_line, array_to_string($flat_header));
        return;
    }

    // --- body -------------------------------------------------------------
    $time_start = microtime(TRUE);
    $content = file_get_contents($url);
    $time_long = round(microtime(TRUE) - $time_start, 3);
    $count_body++;
    $time_body = $time_body + $time_long;

    if ($content === FALSE) {
        $count_error++;
        log_print("page = $page, code = get content FALSE, time = $time_long seconds\n");
        alarm($page, 'get content FALSE', array_to_string($flat_header));
        return;
    }

    log_print('content length = ' . strlen($content) . " bytes, time = $time_long seconds\n");
    $title = get_title($content);
    log_print("title = $title\n");

    if ($time_long > $timeout) {
        alarm($page, "time too long: $time_long > $timeout", array_to_string($flat_header));
    }

    if (!should_include($content)) {
        alarm($page, 'should_include not found', $content);
    } else {
        log_print("$page should_include found\n");
    }

    // should_not_include() returns FALSE when a forbidden marker was found.
    if (!should_not_include($content)) {
        alarm($page, 'should_not_include found', $content);
    } else {
        log_print("$page should_not_include not found\n");
    }
}
/**
 * Tell whether the page body contains at least one expected marker.
 *
 * Markers come from the comma-separated global $should_include. The match
 * test is strict (!== false) so a marker located at offset 0 of the body
 * counts as found; a loose comparison would treat offset 0 as "not found".
 * Increments $count_error when no marker is present.
 *
 * @param string $content Page body.
 * @return bool TRUE when a marker was found, FALSE otherwise.
 */
function should_include($content) {
    global $should_include;
    global $count_error;

    $content = (string) $content;
    foreach (explode(',', $should_include) as $token) {
        $token = trim($token);
        if ($token === '') {
            // Empty entries (e.g. from a trailing comma) are not markers.
            continue;
        }
        print "token=$token,";
        if (strpos($content, $token) !== false) {
            log_print("found = $token\n");
            return TRUE;//found
        }
    }
    print "\n";
    $count_error++;
    return FALSE;//not found
}
/**
 * Tell whether the page body is free of every forbidden marker.
 *
 * Markers come from the comma-separated global $should_not_include. The match
 * test is strict (!== false) so an error message sitting at the very start of
 * the body (offset 0) is detected. Increments $count_error when a marker is
 * found. The log line is kept in a local variable so the main script's $line
 * is not overwritten.
 *
 * @param string $content Page body.
 * @return bool TRUE when no marker is present, FALSE when one was found.
 */
function should_not_include($content) {
    global $should_not_include;
    global $count_error;

    $content = (string) $content;
    foreach (explode(',', $should_not_include) as $token) {
        $token = trim($token);
        if ($token === '') {
            // Empty entries (e.g. from a trailing comma) are not markers.
            continue;
        }
        print "token=$token,";
        if (strpos($content, $token) !== false) {
            log_print("found = $token\n");
            $count_error++;
            return FALSE;//found
        }
    }
    print "\n";
    return TRUE;//not found
}
/**
 * Extract the text of the first <title> element from a page body.
 *
 * Returns an empty string when there is no title element (sitemap.xml,
 * robots.txt, truncated pages) instead of slicing the body at meaningless
 * offsets. Tag matching is case-insensitive and tolerates attributes on the
 * opening tag; the title text is returned as-is, without trimming.
 *
 * @param string $content Page body.
 * @return string Title text, or '' when none is found.
 */
function get_title($content) {
    if (preg_match('#<title\b[^>]*>(.*?)</title>#is', (string) $content, $match)) {
        return $match[1];
    }
    return '';
}
/**
 * Log an alarm and e-mail it to the monitoring point's mailbox.
 *
 * The log line keeps its trailing newline, but the mail subject is reduced to
 * a single line: CR/LF characters are replaced with spaces and the result is
 * trimmed, so the subject never carries line breaks into the mail headers.
 *
 * @param string $page    Page that triggered the alarm.
 * @param string $code    Short reason (status line, failed check, ...).
 * @param string $message Mail body (headers dump or page content).
 */
function alarm($page, $code, $message) {
    global $email;

    $alarm = "ALARM: page = $page, code = $code\n";
    log_print($alarm);

    $subject = trim(str_replace(array("\r", "\n"), ' ', $alarm));
    sendmail($email, $subject, $message);
}
/**
 * Send a plain e-mail through PHP's mail().
 *
 * A rejected message is reported on standard output instead of being dropped
 * silently; since the alarms are delivered by mail, a failing mail setup
 * would otherwise hide every problem found.
 *
 * @param string $mailbox Recipient address.
 * @param string $subject Mail subject.
 * @param string $message Mail body.
 * @return bool TRUE when mail() accepted the message for delivery.
 */
function sendmail($mailbox, $subject, $message) {
    $accepted = mail($mailbox, $subject, $message);
    if (!$accepted) {
        print "sendmail failed: mailbox = $mailbox, subject = $subject\n";
    }
    return $accepted;
}
/**
 * Write a message to standard output and append it to the global $log
 * buffer, which the main script later sends as the summary mail body.
 *
 * @param string $string Text to emit (callers include the newline).
 */
function log_print($string) {
    global $log;
    echo $string;
    $log = $log . $string;
}
/**
 * Render an array as "[key] => value,\n" lines for logs and mail bodies.
 *
 * Values that are themselves arrays (for example repeated response headers
 * returned by get_headers() in associative mode) are joined with ", " rather
 * than being printed as the literal word "Array" with a conversion notice.
 *
 * @param array $array Keys mapped to scalar values or arrays of scalars.
 * @return string One line per entry.
 */
function array_to_string($array) {
    $string = '';
    foreach ($array as $key => $value) {
        if (is_array($value)) {
            $value = implode(', ', $value);
        }
        $string .= "[$key] => $value,\n";
    }
    return $string;
}
//main loop
// Walk every group assigned to this monitoring point, expand it into its
// site list and check each site.
$time_from = time();
foreach (explode(',', $groups) as $value) {
    $value = trim($value);
    if (!array_key_exists($value, $group)) {
        log_print("group $value not found\n");
        continue;
    }

    $series = $group[$value];
    $group_id = $series['group_id'];
    $sub_mode = $series['sub_mode'];
    $base_url = $series['base_url'];
    $sub_sites = $series['sub_sites'];
    $count_group++;
    $line = "===========================================\n";
    $line .= "count_group = $count_group, group_id = $group_id, sub_mode = $sub_mode, base_url = $base_url, sub_sites = $sub_sites\n";
    log_print($line);

    switch ($sub_mode) {
        case 'sub_path':
            $sites = sub_path($base_url, $sub_sites);
            break;
        case 'sub_domain':
            $sites = sub_domain($base_url, $sub_sites);
            break;
        default:
            log_print("sub_mode $sub_mode error\n");
            // Skip this group: falling through would re-check the previous
            // group's sites (or an undefined $sites on the first group).
            continue 2;
    }

    log_print("sites = $sites\n");
    check_sites($sites);
}

// Summary statistics. Each average is guarded because its counter stays at
// zero when no group matched or no page could be fetched.
$time_to = time();
$time_total = $time_to - $time_from;
$time_average = ($count_site > 0) ? round($time_total / $count_site, 3) : 0;
// gmdate() gives the same HH:MM:SS text as gmstrftime('%H:%M:%S'), which is
// deprecated as of PHP 8.1.
$time_total = gmdate('H:i:s', $time_total);
$time_header_average = ($count_header > 0) ? round($time_header / $count_header, 3) : 0;
$time_body_average = ($count_body > 0) ? round($time_body / $count_body, 3) : 0;
$mailbox = $email;
$line = "===========================================\n";
log_print($line);
$subject = "point $point_id: $count_group groups $count_site sites $count_error errors";
$subject .= ", total: $time_total average: $time_average seconds, header $time_header_average / body $time_body_average";
log_print("$subject\n");
$line = "from ".date('Y-m-d H:i:s',$time_from)." to ".date('Y-m-d H:i:s',$time_to)."\n";
$line .= "time_header/count_header = $time_header/$count_header, time_body/count_body = $time_body/$count_body\n";
log_print($line);
$message = $log;
sendmail($mailbox, $subject, $message);
// NOTE(review): log_print() has already echoed every line, so this prints the
// whole log a second time - confirm whether the duplicate output is wanted.
print "$log";
?>
再在Linux的/etc/crontab中加入定时运行,国内服务器运行检查国内站点的:
00 01 * * * root php /path/main.php china_0
30 04 * * * root php /path/main.php china_1
国外服务器运行检查国外站点的:
00 02 * * * root php /path/main.php foreign_0
30 05 * * * root php /path/main.php foreign_1
这样每天凌晨自动运行巡检并发送统计结果的邮件到指定信箱。
上面这是检查网站首页的情况,如果要检查sitemap.xml,可以在/etc/crontab中加入这样的:
00 01 * * * root php /path/main.php china_0 sitemap.xml
后面还可以专门针对sitemap.xml的检查对main.php进行修改。
评论