我们一直很重视网站地图对搜索引擎的提交,以前的MediaWiki自带生成sitemap的程序,Drupal也有专门的第三方扩展XML Sitemap程序。
但Drupal的这个扩展只能对node, user, taxonomy term, menu等生成网站地图,也可以手工添加custom网址加入地图中,但却无法把Views批量做成的页面都加进去。这个问题以前不算很突出、很重要,因为主要页面都是node页面或者分类页面,但采取“在Drupal中直接导入、使用数据库”的办法以后,一个网站的主要页面基本上都是Views生成的,这时Drupal的xmlsitemap扩展程序就起不了很大作用了。
去年尝试在Drupal中直接把数据导入数据库表的时候就考虑到这个问题,也有了一些思路,可以用Views本身来生成xmlsitemap,例如生成json数据,再用一个外部sitemap.php程序调用、呈现。但感觉麻烦了一些,如果直接用SQL语句查询数据库,可以省去做专门Views的过程。
最近我们在采取新办法搭建台湾《国语小字典》、《国语小辞典》、《国语大辞典》、《成语辞典》的时候,就专门花时间来研究这个,也算是找到了一个比较好的解决办法,办法是在网站根目录下创建一个sitemap.php,在.htaccess里面用URL重写规则(RewriteRule)来设置sitemap访问的URL网址:
# Base URL used for per-directory rewrites in this .htaccess file.
RewriteBase /
# robots.txt: requests for /robots.txt are rewritten to /robots.php.
RewriteCond %{REQUEST_URI} ^\/robots\.txt$
RewriteRule ^(.*)$ /robots.php [L]
# sitemap_xxx.xml: for each host, the sitemap names listed in the alternation
# are rewritten to /sitemap.php. RewriteCond lines only apply to the
# RewriteRule that immediately follows them, so every host needs its own
# Cond/Cond/Rule group. [L] stops rule processing once a rule has matched.
# The names in each alternation must match the keys that sitemap.php defines
# for that host.
# Host: zidian.18dao.net
RewriteCond %{HTTP_HOST} ^zidian\.18dao\.net$
RewriteCond %{REQUEST_URI} ^\/sitemap_(danzi|bushou|bihua|bushoubihua)\.xml$
RewriteRule ^(.*)$ /sitemap.php [L]
# Host: cidian.18dao.net
RewriteCond %{HTTP_HOST} ^cidian\.18dao\.net$
RewriteCond %{REQUEST_URI} ^\/sitemap_(zici|bushou|bihua)\.xml$
RewriteRule ^(.*)$ /sitemap.php [L]
# Host: dacidian.18dao.net
RewriteCond %{HTTP_HOST} ^dacidian\.18dao\.net$
RewriteCond %{REQUEST_URI} ^\/sitemap_(zici|bushou|bihua)\.xml$
RewriteRule ^(.*)$ /sitemap.php [L]
# Host: chengyu.18dao.net
RewriteCond %{HTTP_HOST} ^chengyu\.18dao\.net$
RewriteCond %{REQUEST_URI} ^\/sitemap_(chengyu|shouzipinyin|shouzibihuashu|shouzi|weizi)\.xml$
RewriteRule ^(.*)$ /sitemap.php [L]
一个目录下的多站点设置时,可以像上面这样对不同的网站设置几个不同的网站地图。
而sitemap.php的源代码如下,一些注释就直接写在里面:
<?php
/*
* sitemap.php
* James Qi
* 2018-1-11
* example: https://cidian.18dao.net/sitemap_zici.xml?page=2
* modify .htaccess, example:
RewriteCond %{HTTP_HOST} ^cidian\.18dao\.net$
RewriteCond %{REQUEST_URI} ^\/sitemap_(zici|bushou|bihua)\.xml$
RewriteRule ^(.*)$ /sitemap.php [L]
* modify robots.txt, example:
Sitemap: https://cidian.18dao.net/sitemap_zici.xml
Sitemap: https://cidian.18dao.net/sitemap_bushou.xml
Sitemap: https://cidian.18dao.net/sitemap_bihua.xml
*/
# Sitemaps are served as application/xml. The header is set explicitly;
# otherwise PHP would send its configured default content type.
header("Content-Type: application/xml");
# PHP runtime settings
ini_set('memory_limit', '512M');
# NOTE(review): with display_errors on, any notice or warning is printed into
# the XML body and breaks the document; consider turning it off in production.
ini_set('display_errors', 'On');
error_reporting(E_ALL);
# Request parameters
$http_host = $_SERVER['HTTP_HOST'];// example: cidian.18dao.net
$request_uri = $_SERVER['REQUEST_URI'];// example: /sitemap_zici.xml?page=2
$query_string = isset($_SERVER['QUERY_STRING']) ? $_SERVER['QUERY_STRING'] : '';// example: page=2
# Extract the sitemap name from the path part only (query string removed).
# The whole path has to look like /sitemap_<name>.xml; anything else, such as
# /sitemap.xml or a direct hit on /sitemap.php, is answered with "no sitemap"
# instead of carrying a garbage name into the database code further down.
$request_path = (string) strtok($request_uri, '?');
if (!preg_match('#^/sitemap_([A-Za-z0-9_]+)\.xml$#', $request_path, $map_match)) {
    http_response_code(404);
    header("Content-Type: text/plain");
    print "no sitemap";
    exit;
}
$map = $map_match[1];// example: zici
# Map every served host name to the MySQL database that backs it.
$database_array = [
    'zidian.18dao.net'   => 'net_18dao_zidian',
    'cidian.18dao.net'   => 'net_18dao_cidian',
    'dacidian.18dao.net' => 'net_18dao_dacidian',
    'chengyu.18dao.net'  => 'net_18dao_chengyu',
];
$database = $database_array[$http_host];// example: 'net_18dao_cidian'

# Sitemap defaults; an individual sitemap definition may override them.
$lastmod_default      = '2018-01-13T00:00:00Z';
$priority_default     = '0.5';// range: 0.0 - 1.0
$changefreq_default   = 'weekly';// always, hourly, daily, weekly, monthly, yearly, never
$path_default         = $map;// URL path defaults to the sitemap name, example: 'zici'
$url_per_page_default = 10000;// the sitemap protocol allows at most 50000 URLs per sitemap file
# Per-sitemap settings, indexed first by host name and then by sitemap name.
# Required keys: table, field.
# Optional keys (the defaults are used when omitted): lastmod, priority, changefreq.
$sitemap_array = [
    'zidian.18dao.net' => [
        'danzi' => [
            'table' => 'dict_mini',
            'field' => 'danzi',
        ],
        'bushou' => [
            'table'    => 'dict_mini',
            'field'    => 'bushou',
            'priority' => '0.6',
        ],
        'bihua' => [
            'table'    => 'dict_mini',
            'field'    => 'danzibihua',
            'priority' => '0.6',
        ],
        'bushoubihua' => [
            'table'    => 'dict_mini',
            'field'    => 'bushoubihua',
            'priority' => '0.6',
        ],
    ],
    'cidian.18dao.net' => [
        'zici' => [
            'table' => 'dict_concised',
            'field' => 'ziciming',
        ],
        'bushou' => [
            'table'    => 'dict_concised',
            'field'    => 'bushou',
            'priority' => '0.6',
        ],
        'bihua' => [
            'table'    => 'dict_concised',
            'field'    => 'zongbihuashu',
            'priority' => '0.6',
        ],
    ],
    'dacidian.18dao.net' => [
        'zici' => [
            'table' => 'dict_revised',
            'field' => 'ziciming',
        ],
        'bushou' => [
            'table'    => 'dict_revised',
            'field'    => 'bushouzi',
            'priority' => '0.6',
        ],
        'bihua' => [
            'table'    => 'dict_revised',
            'field'    => 'zongbihuashu',
            'priority' => '0.6',
        ],
    ],
    'chengyu.18dao.net' => [
        'chengyu' => [
            'table' => 'dict_idioms',
            'field' => 'chengyu',
        ],
        'shouzipinyin' => [
            'table'    => 'dict_idioms',
            'field'    => 'shouzipinyin',
            'priority' => '0.6',
        ],
        'shouzibihuashu' => [
            'table'    => 'dict_idioms',
            'field'    => 'shouzibihuashu',
            'priority' => '0.6',
        ],
        'shouzi' => [
            'table'    => 'dict_idioms',
            'field'    => 'shouzi',
            'priority' => '0.7',
        ],
        'weizi' => [
            'table'      => 'dict_idioms',
            'field'      => 'weizi',
            'priority'   => '0.4',
            'changefreq' => 'monthly',
            'lastmod'    => '2018-01-13T10:00:00Z',
        ],
    ],
];
# Resolve the effective settings of the requested sitemap.
# A host/sitemap pair without a definition is rejected right here; otherwise
# $table and $field would be null and the SQL built further down would be
# invalid.
if (!isset($sitemap_array[$http_host][$map]['table'], $sitemap_array[$http_host][$map]['field'])) {
    http_response_code(404);
    header("Content-Type: text/plain");
    print "no sitemap";
    exit;
}
$sitemap_config = $sitemap_array[$http_host][$map];
# Required settings
$table = $sitemap_config['table'];// example: 'dict_concised'
$field = $sitemap_config['field'];// example: 'ziciming'
# Optional settings: fall back to the *_default value when the key is absent.
$priority     = isset($sitemap_config['priority'])     ? $sitemap_config['priority']     : $priority_default;
$changefreq   = isset($sitemap_config['changefreq'])   ? $sitemap_config['changefreq']   : $changefreq_default;
$lastmod      = isset($sitemap_config['lastmod'])      ? $sitemap_config['lastmod']      : $lastmod_default;
$url_per_page = isset($sitemap_config['url_per_page']) ? $sitemap_config['url_per_page'] : $url_per_page_default;// default: 10000
$path         = isset($sitemap_config['path'])         ? $sitemap_config['path']         : $path_default;// example: 'zici'
# Database server credentials
$serverName = '***';
$userName = '***';
$password = '***';
# Connect and select the database of the current site.
# The failure messages are built with the mysqli error functions: the
# connection is opened through mysqli, and the mysql_* functions used before
# are not available on PHP 7+, which turned every failure into a fatal
# "undefined function" error instead of the intended message.
$link = mysqli_connect($serverName, $userName, $password)
    or die("unable to connect to msql server: " . mysqli_connect_error());
mysqli_select_db($link, (string) $database)
    or die("unable to select database 'db': " . mysqli_error($link));
# Build the response body. Three shapes are produced:
#   - no query string, everything fits on one page: one urlset with all URLs
#     example: https://cidian.18dao.net/sitemap_bihua.xml
#   - no query string, more than one page: a sitemapindex listing the pages
#     example: https://cidian.18dao.net/sitemap_zici.xml
#   - query string present: the urlset of the requested page
#     example: https://cidian.18dao.net/sitemap_zici.xml?page=2
$output = '';
$base_url = "https://$http_host/$path";
# Rows whose value is empty or contains 'gif' / 'jpg' are left out.
$where = "not $field like '%gif%' and not $field like '%jpg%' and not $field = ''";
if ($query_string == NULL) {
    $sql = "SELECT DISTINCT $field FROM $table WHERE $where";
    $result = mysqli_query($link, $sql)
        or die("unable to query database: " . mysqli_error($link));
    $pages = ceil($result->num_rows / $url_per_page);
    if ($pages <= 1) {
        # Single file. An empty result (0 pages) also ends up here so that it
        # yields an empty urlset rather than an index without any entries.
        $output .= map_start();
        while ($row = $result->fetch_array()) {
            $output .= sitemap_url_entry($row[0], $base_url, $lastmod, $changefreq, $priority);
        }
        $output .= map_end();
    } else {
        # Index file: one child sitemap per page, addressed through ?page=N.
        # Only the path part of the request URI is used, so a dangling '?'
        # cannot end up inside the generated URLs.
        $index_path = strtok($request_uri, '?');
        $output .= index_start();
        for ($i = 1; $i <= $pages; $i++) {
            $output .= "\t<sitemap>\n";
            $output .= "\t\t<loc>https://$http_host$index_path?page=$i</loc>\n";
            $output .= "\t\t<lastmod>$lastmod</lastmod>\n";
            $output .= "\t</sitemap>\n";
        }
        $output .= index_end();
    }
} else {
    # Paged file. The page number comes from the client, so it is parsed and
    # forced to a positive integer before it gets anywhere near the SQL text.
    parse_str($query_string, $query_params);
    $page = isset($query_params['page']) ? (int) $query_params['page'] : 1;
    if ($page < 1) {
        $page = 1;
    }
    $limit = (int) $url_per_page;
    $offset = ($page - 1) * $limit;
    $sql = "SELECT DISTINCT $field FROM $table WHERE $where LIMIT $limit OFFSET $offset";
    $result = mysqli_query($link, $sql)
        or die("unable to query database: " . mysqli_error($link));
    $output .= map_start();
    while ($row = $result->fetch_array()) {
        $output .= sitemap_url_entry($row[0], $base_url, $lastmod, $changefreq, $priority);
    }
    $output .= map_end();
}
# Send the document
print $output;

/**
 * Render one url element of a sitemap.
 *
 * Values containing '&' are skipped and produce an empty string. The check
 * uses a strict comparison so that a value starting with '&' (strpos
 * returning 0) is skipped as well.
 * NOTE(review): the reason for excluding '&' values is not visible in this
 * file; confirm against the pages that serve these URLs.
 *
 * @param string $value      raw field value; URL-encoded before it is appended
 * @param string $base_url   URL prefix without trailing slash
 * @param string $lastmod    content of the lastmod element
 * @param string $changefreq content of the changefreq element
 * @param string $priority   content of the priority element
 * @return string the url element followed by a newline, or '' when skipped
 */
function sitemap_url_entry($value, $base_url, $lastmod, $changefreq, $priority) {
    if (strpos($value, '&') !== false) {
        return '';
    }
    return "<url>"
        . "<loc>$base_url/" . urlencode($value) . "</loc>"
        . "<lastmod>$lastmod</lastmod>"
        . "<changefreq>$changefreq</changefreq>"
        . "<priority>$priority</priority>"
        . "</url>\n";
}
# 定义sitemap和index开头和结尾的内容
/**
 * Opening part of a URL sitemap document.
 *
 * @return string XML declaration, the stylesheet instruction pointing at
 *                /sitemap.xsl on the requesting host (taken from
 *                $_SERVER['HTTP_HOST']) and the opening urlset tag, each
 *                terminated by a newline
 */
function map_start() {
    $host = $_SERVER['HTTP_HOST'];
    $lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<?xml-stylesheet type="text/xsl" href="https://' . $host . '/sitemap.xsl"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ];
    return implode("\n", $lines) . "\n";
}
/**
 * Closing part of a URL sitemap document.
 *
 * @return string the closing urlset tag followed by a newline
 */
function map_end() {
    return "</urlset>\n";
}
/**
 * Opening part of a sitemap index document.
 *
 * @return string XML declaration, the stylesheet instruction pointing at
 *                /sitemap.xsl on the requesting host (taken from
 *                $_SERVER['HTTP_HOST']) and the opening sitemapindex tag,
 *                each terminated by a newline
 */
function index_start() {
    $declaration = '<?xml version="1.0" encoding="UTF-8"?>' . "\n";
    $stylesheet = sprintf(
        '<?xml-stylesheet type="text/xsl" href="https://%s/sitemap.xsl"?>' . "\n",
        $_SERVER['HTTP_HOST']
    );
    $root = '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">' . "\n";
    return $declaration . $stylesheet . $root;
}
/**
 * Closing part of a sitemap index document.
 *
 * @return string the closing sitemapindex tag followed by a newline
 */
function index_end() {
    return "</sitemapindex>\n";
}
?>
上面程序的缩进格式有点问题,看起来不太清晰,我后面再来调整。另外就是在robots.txt中增加如下这样的内容让搜索引擎来发现网站地图:
# sitemap start
Sitemap: https://zidian.18dao.net/sitemap.xml
Sitemap: https://zidian.18dao.net/rss.xml
Sitemap: https://zidian.18dao.net/sitemap_danzi.xml
Sitemap: https://zidian.18dao.net/sitemap_bushou.xml
Sitemap: https://zidian.18dao.net/sitemap_bihua.xml
Sitemap: https://zidian.18dao.net/sitemap_bushoubihua.xml
# sitemap end
并且到Google Search Console、百度站长平台等地方去主动提交sitemap。
评论