在向百度站长平台提交MIP (Mobile Instant Page - 移动网页加速器) 网址的时候,我们采用了《百度MIP版本链接的批量提交》一文中的办法,这样确实可以定时自动提交,不过要整理出需要提交的网址文本这个过程很耗时,特别是我们一些站点的网址数量庞大,用浏览器一页一页访问sitemap页面、保存、合并、替换、上传等每个环节都需要手工操作并苦苦等待。
今天下午干脆花了一些时间编写了一个PHP程序:设置好参数后,程序会自动读取预设的sitemap网址、下载数据,并进行替换、合并,最后保存到指定的文件中。整个过程无需手工操作,虽然读取sitemap网址依然比较慢,但已经大大简化了操作、提高了效率。
程序源代码如下(分为MediaWiki版本和Drupal版本,两者的sitemap格式稍有不同):
适合MediaWiki生成sitemap的源文件:mediawiki_url_from_xml_to_txt.php
<?php
/*
* convert mediawiki url from xmlsitemap to text format
* jamesqi 2017-8-30
*
*/
// ---- user configuration: edit the values between here and "end of user configuration" ----
// Sitemap files to download, one URL per line. The script below splits this
// string on "\n", so keep exactly one URL on each line.
$input_xmlsitemap_url = '
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-0.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-1.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-2.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-3.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-4.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-5.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-6.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_0-7.xml
https://tw.18dao.net/sitemap-tw18daonet-jingle-NS_14-0.xml
';
// File the merged URL list is written to (opened in "w" mode, so it is overwritten).
$output_txt_file_name = 'tw.mip.18dao.net.url.txt';
// Host name as it appears in the sitemap URLs; passed to xmlsitemap_to_text().
$domain_input = 'tw.18dao.net';
// Host name substituted for $domain_input in the output; passed to xmlsitemap_to_text().
$domain_output = 'tw.mip.18dao.net';
// ---- end of user configuration ----
// Nothing below this line normally needs to be edited.
/**
 * Convert the XML of one sitemap file into a plain-text URL list.
 *
 * The content of every <loc>...</loc> element becomes one output line; all
 * other elements (<lastmod>, <priority>, the <urlset>/<url> wrappers, the XML
 * declaration) are ignored. Extracting <loc> directly, instead of deleting
 * the surrounding markup piece by piece, keeps the function independent of
 * the exact whitespace and tag order the sitemap generator emits.
 *
 * A URL that starts with "https://$domain_input" has that prefix replaced by
 * "https://$domain_output"; other URLs are passed through unchanged.
 * XML entities in the URL (e.g. "&amp;") are decoded, because the result is a
 * plain-text list rather than XML.
 *
 * @param string $input         Raw sitemap XML (a non-string such as false is treated as empty).
 * @param string $domain_input  Host name used in the sitemap, e.g. 'tw.18dao.net'.
 * @param string $domain_output Host name to write instead, e.g. 'tw.mip.18dao.net'.
 * @return string One URL per line, each line terminated by "\n"; '' when no URL was found.
 */
function xmlsitemap_to_text($input,$domain_input,$domain_output) {
    // file_get_contents() yields false on failure; never feed that to the regex engine.
    if (!is_string($input) || $input === '') {
        return '';
    }

    // preg_match_all() returns 0 for "no match" and false on error; both mean "nothing to output".
    if (!preg_match_all('/<loc>\s*([^<]*?)\s*<\/loc>/i', $input, $matches)) {
        return '';
    }

    $prefix_input = "https://$domain_input";
    $prefix_output = "https://$domain_output";
    $prefix_input_length = strlen($prefix_input);

    $output = '';
    foreach ($matches[1] as $url) {
        $url = html_entity_decode($url, ENT_QUOTES | ENT_XML1, 'UTF-8');
        if ($url === '') {
            continue;
        }
        // Only rewrite the leading scheme+host, never an occurrence inside the path or query.
        if (strncmp($url, $prefix_input, $prefix_input_length) === 0) {
            $url = $prefix_output . substr($url, $prefix_input_length);
        }
        $output .= $url . "\n";
    }
    return $output;
}
// ---- main script: download every configured sitemap, convert it, merge, save ----
// Progress is printed step by step so a cron log shows what happened for each sitemap.
print "programe start\n";
print "input_xmlsitemap_url = $input_xmlsitemap_url\n";
print "output_txt_file_name = $output_txt_file_name\n";
print "domain_input = $domain_input\n";
print "domain_output = $domain_output\n";

// The configuration is one multi-line string. Trim the whole string and every
// entry so leading/trailing blank lines, indentation or CRLF line endings do
// not end up inside the URLs handed to file_get_contents().
$input_xmlsitemap_url_array = array_map('trim', explode("\n", trim($input_xmlsitemap_url)));
$input_xmlsitemap_url_array_count = count($input_xmlsitemap_url_array);
print "input_xmlsitemap_url_array_count = $input_xmlsitemap_url_array_count lines\n";

$input_xmlsitemap_url_count = 0;
$input_xmlsitemap_url_fail_count = 0;
$output_txt = '';
$output_txt_length = 0;
$output_txt_count = 0;
print_r($input_xmlsitemap_url_array);

foreach ($input_xmlsitemap_url_array as $input_xmlsitemap_url_key=>$input_xmlsitemap_url_value) {
    print "\n=======================\n\n";
    print "input_xmlsitemap_url_key = $input_xmlsitemap_url_key\n";
    print "input_xmlsitemap_url_value = $input_xmlsitemap_url_value\n";

    if ($input_xmlsitemap_url_value === "") {
        print "skip this null line\n";
        continue;
    }

    $input_xmlsitemap_url_count++;
    print "input_xmlsitemap_url_count = $input_xmlsitemap_url_count\n";

    // file_get_contents() returns false when the download fails; report it and
    // go on with the next sitemap instead of silently processing nothing.
    $input_xmlsitemap_content = file_get_contents($input_xmlsitemap_url_value);
    if ($input_xmlsitemap_content === false) {
        $input_xmlsitemap_url_fail_count++;
        print "ERROR: unable to read $input_xmlsitemap_url_value, skip this sitemap\n";
        continue;
    }
    $input_xmlsitemap_content_length = strlen($input_xmlsitemap_content);
    print "input_xmlsitemap_content_length = $input_xmlsitemap_content_length bytes\n";

    $output_text_content = xmlsitemap_to_text($input_xmlsitemap_content,$domain_input,$domain_output);
    // Guarantee a trailing newline so the first URL of the next sitemap never
    // gets glued onto the last URL of this one.
    if ($output_text_content !== '' && substr($output_text_content, -1) !== "\n") {
        $output_text_content .= "\n";
    }
    $output_text_content_length = strlen($output_text_content);
    print "output_text_content_length = $output_text_content_length bytes\n";

    // Count newline terminators; explode("\n", ...) would report one extra
    // (empty) element for every chunk that ends with "\n".
    $output_text_content_line_count = substr_count($output_text_content, "\n");
    print "output_text_content_array_count = $output_text_content_line_count lines\n";

    $output_txt .= $output_text_content;
    $output_txt_length += $output_text_content_length;
    $output_txt_count += $output_text_content_line_count;
}

print "\n=======================\n\n";
print "input_xmlsitemap_url_fail_count = $input_xmlsitemap_url_fail_count\n";
print "output_txt_length = $output_txt_length bytes\n";
print "output_txt_count = $output_txt_count lines\n";

// Do not replace a previously generated list with an empty file when every
// download failed or no URL could be extracted.
if ($output_txt === '') {
    print "ERROR: no url extracted, $output_txt_file_name is left untouched\n";
    exit(1);
}

if (file_put_contents($output_txt_file_name, $output_txt) === false) {
    die("Unable to open file!");
}
print "programe end\n";
?>
适合Drupal生成sitemap的源文件:drupal_url_from_xml_to_txt.php
<?php
/*
* convert drupal url from xmlsitemap to text format
* jamesqi 2017-8-30
*
*/
// ---- user configuration: edit the values between here and "end of user configuration" ----
// Sitemap pages to download, one URL per line. The script below splits this
// string on "\n", so keep exactly one URL on each line.
$input_xmlsitemap_url = '
https://114.mingluji.com/sitemap.xml?page=1
https://114.mingluji.com/sitemap.xml?page=2
https://114.mingluji.com/sitemap.xml?page=3
https://114.mingluji.com/sitemap.xml?page=4
';
// File the merged URL list is written to (opened in "w" mode, so it is overwritten).
$output_txt_file_name = '114.mingluji.com.url.txt';
// Domain rewriting is disabled in this variant: xmlsitemap_to_text() below
// takes only the XML input, so these two settings are left commented out.
//$domain_input = 'tw.18dao.net';
//$domain_output = 'tw.mip.18dao.net';
// ---- end of user configuration ----
// Nothing below this line normally needs to be edited.
/**
 * Convert the XML of one Drupal sitemap page into a plain-text URL list.
 *
 * The content of every <loc>...</loc> element becomes one output line; all
 * other markup (XML declaration, <?xml-stylesheet ?>, <urlset>, <url>,
 * <lastmod>, <changefreq>, <priority>) is ignored. Extracting <loc> directly,
 * instead of deleting each surrounding tag with its own pattern, keeps the
 * function independent of whether the generator puts every <url> entry on a
 * single line or spreads it over several.
 *
 * XML entities in the URL (e.g. "&amp;") are decoded, because the result is a
 * plain-text list rather than XML. URLs are otherwise passed through unchanged.
 *
 * @param string $input Raw sitemap XML (a non-string such as false is treated as empty).
 * @return string One URL per line, each line terminated by "\n"; '' when no URL was found.
 */
function xmlsitemap_to_text($input) {
    // file_get_contents() yields false on failure; never feed that to the regex engine.
    if (!is_string($input) || $input === '') {
        return '';
    }

    // preg_match_all() returns 0 for "no match" and false on error; both mean "nothing to output".
    if (!preg_match_all('/<loc>\s*([^<]*?)\s*<\/loc>/i', $input, $matches)) {
        return '';
    }

    $output = '';
    foreach ($matches[1] as $url) {
        $url = html_entity_decode($url, ENT_QUOTES | ENT_XML1, 'UTF-8');
        if ($url !== '') {
            $output .= $url . "\n";
        }
    }
    return $output;
}
// ---- main script: download every configured sitemap page, convert it, merge, save ----
// Progress is printed step by step so a cron log shows what happened for each sitemap.
print "programe start\n";
print "input_xmlsitemap_url = $input_xmlsitemap_url\n";
print "output_txt_file_name = $output_txt_file_name\n";
// The domain settings are commented out in the configuration of this variant;
// print them only when someone has re-enabled them, to avoid "undefined variable" warnings.
if (isset($domain_input)) {
    print "domain_input = $domain_input\n";
}
if (isset($domain_output)) {
    print "domain_output = $domain_output\n";
}

// The configuration is one multi-line string. Trim the whole string and every
// entry so leading/trailing blank lines, indentation or CRLF line endings do
// not end up inside the URLs handed to file_get_contents().
$input_xmlsitemap_url_array = array_map('trim', explode("\n", trim($input_xmlsitemap_url)));
$input_xmlsitemap_url_array_count = count($input_xmlsitemap_url_array);
print "input_xmlsitemap_url_array_count = $input_xmlsitemap_url_array_count lines\n";

$input_xmlsitemap_url_count = 0;
$input_xmlsitemap_url_fail_count = 0;
$output_txt = '';
$output_txt_length = 0;
$output_txt_count = 0;
print_r($input_xmlsitemap_url_array);

foreach ($input_xmlsitemap_url_array as $input_xmlsitemap_url_key=>$input_xmlsitemap_url_value) {
    print "\n=======================\n\n";
    print "input_xmlsitemap_url_key = $input_xmlsitemap_url_key\n";
    print "input_xmlsitemap_url_value = $input_xmlsitemap_url_value\n";

    if ($input_xmlsitemap_url_value === "") {
        print "skip this null line\n";
        continue;
    }

    $input_xmlsitemap_url_count++;
    print "input_xmlsitemap_url_count = $input_xmlsitemap_url_count\n";

    // file_get_contents() returns false when the download fails; report it and
    // go on with the next sitemap instead of silently processing nothing.
    $input_xmlsitemap_content = file_get_contents($input_xmlsitemap_url_value);
    if ($input_xmlsitemap_content === false) {
        $input_xmlsitemap_url_fail_count++;
        print "ERROR: unable to read $input_xmlsitemap_url_value, skip this sitemap\n";
        continue;
    }
    $input_xmlsitemap_content_length = strlen($input_xmlsitemap_content);
    print "input_xmlsitemap_content_length = $input_xmlsitemap_content_length bytes\n";

    $output_text_content = xmlsitemap_to_text($input_xmlsitemap_content);
    // Guarantee a trailing newline so the first URL of the next page never
    // gets glued onto the last URL of this one.
    if ($output_text_content !== '' && substr($output_text_content, -1) !== "\n") {
        $output_text_content .= "\n";
    }
    $output_text_content_length = strlen($output_text_content);
    print "output_text_content_length = $output_text_content_length bytes\n";

    // Count newline terminators; explode("\n", ...) would report one extra
    // (empty) element for every chunk that ends with "\n".
    $output_text_content_line_count = substr_count($output_text_content, "\n");
    print "output_text_content_array_count = $output_text_content_line_count lines\n";

    $output_txt .= $output_text_content;
    $output_txt_length += $output_text_content_length;
    $output_txt_count += $output_text_content_line_count;
}

print "\n=======================\n\n";
print "input_xmlsitemap_url_fail_count = $input_xmlsitemap_url_fail_count\n";
print "output_txt_length = $output_txt_length bytes\n";
print "output_txt_count = $output_txt_count lines\n";

// Do not replace a previously generated list with an empty file when every
// download failed or no URL could be extracted.
if ($output_txt === '') {
    print "ERROR: no url extracted, $output_txt_file_name is left untouched\n";
    exit(1);
}

if (file_put_contents($output_txt_file_name, $output_txt) === false) {
    die("Unable to open file!");
}
print "programe end\n";
?>
上面程序运行起来都有一步一步的提示,可以看到运行结果,运行结束后打开生成的纯网址文本文件检查,如果发现哪里异常,也可能要稍微调整一下程序,来适应稍有不同的sitemap格式。
其它方面,比如设置cron定时运行的办法,仍与前面提到的博文中相同,运行时同样会生成日志文件。
能够想办法用程序来代替手工重复操作的尽量编程解决,一次性编写、调试麻烦一些,但后面可以节约大量时间精力,而且也不会出差错。✌
评论