sitemaps.php 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549
  1. <?php
  2. /**
  3. * Generate sitemap files in base XML as well as some namespace extensions.
  4. *
  5. * This module generates two different base sitemaps.
  6. *
  7. * 1. sitemap.xml
  8. * The basic sitemap is updated regularly by wp-cron. It is stored in the
  9. * database and retrieved when requested. This sitemap aims to include canonical
  10. * URLs for all published content and abide by the sitemap spec. This is the root
  11. * of a tree of sitemap and sitemap index xml files, depending on the number of URLs.
  12. *
  13. * By default the sitemap contains published posts of type 'post' and 'page', as
  14. * well as the home url. To include other post types use the 'jetpack_sitemap_post_types'
  15. * filter.
  16. *
  17. * @link http://sitemaps.org/protocol.php Base sitemaps protocol.
  18. * @link https://support.google.com/webmasters/answer/178636 Image sitemap extension.
  19. * @link https://developers.google.com/webmasters/videosearch/sitemaps Video sitemap extension.
  20. *
  21. * 2. news-sitemap.xml
  22. * The news sitemap is generated on the fly when requested. It does not aim for
  23. * completeness, instead including at most 1000 of the most recent published posts
  24. * from the previous 2 days, per the news-sitemap spec.
  25. *
  26. * @link http://www.google.com/support/webmasters/bin/answer.py?answer=74288 News sitemap extension.
  27. *
  28. * @package Jetpack
  29. * @since 3.9.0
  30. * @since 4.8.0 Remove 1000 post limit.
  31. * @author Automattic
  32. */
  33. require_once dirname( __FILE__ ) . '/sitemap-constants.php';
  34. require_once dirname( __FILE__ ) . '/sitemap-buffer.php';
  35. require_once dirname( __FILE__ ) . '/sitemap-stylist.php';
  36. require_once dirname( __FILE__ ) . '/sitemap-librarian.php';
  37. require_once dirname( __FILE__ ) . '/sitemap-finder.php';
  38. require_once dirname( __FILE__ ) . '/sitemap-builder.php';
  39. if ( defined( 'WP_DEBUG' ) && WP_DEBUG ) {
  40. require_once dirname( __FILE__ ) . '/sitemap-logger.php';
  41. }
  42. /**
  43. * Governs the generation, storage, and serving of sitemaps.
  44. *
  45. * @since 4.8.0
  46. */
  47. class Jetpack_Sitemap_Manager {
  48. /**
  49. * @see Jetpack_Sitemap_Librarian
  50. * @since 4.8.0
  51. * @var Jetpack_Sitemap_Librarian $librarian Librarian object for storing and retrieving sitemap data.
  52. */
  53. private $librarian;
  54. /**
  55. * @see Jetpack_Sitemap_Logger
  56. * @since 4.8.0
  57. * @var Jetpack_Sitemap_Logger $logger Logger object for reporting debug messages.
  58. */
  59. private $logger;
  60. /**
  61. * @see Jetpack_Sitemap_Finder
  62. * @since 4.8.0
  63. * @var Jetpack_Sitemap_Finder $finder Finder object for dealing with sitemap URIs.
  64. */
  65. private $finder;
  66. /**
  67. * Construct a new Jetpack_Sitemap_Manager.
  68. *
  69. * @access public
  70. * @since 4.8.0
  71. */
  72. public function __construct() {
  73. $this->librarian = new Jetpack_Sitemap_Librarian();
  74. $this->finder = new Jetpack_Sitemap_Finder();
  75. if ( defined( 'WP_DEBUG' ) && ( true === WP_DEBUG ) ) {
  76. $this->logger = new Jetpack_Sitemap_Logger();
  77. }
  78. // Add callback for sitemap URL handler.
  79. add_action(
  80. 'init',
  81. array( $this, 'callback_action_catch_sitemap_urls' ),
  82. defined( 'IS_WPCOM' ) && IS_WPCOM ? 100 : 10
  83. );
  84. // Add generator to wp_cron task list.
  85. $this->schedule_sitemap_generation();
  86. // Add sitemap to robots.txt.
  87. add_action(
  88. 'do_robotstxt',
  89. array( $this, 'callback_action_do_robotstxt' ),
  90. 20
  91. );
  92. // The news sitemap is cached; here we add a callback to
  93. // flush the cached news sitemap when a post is published.
  94. add_action(
  95. 'publish_post',
  96. array( $this, 'callback_action_flush_news_sitemap_cache' ),
  97. 10
  98. );
  99. // In case we need to purge all sitemaps, we do this.
  100. add_action(
  101. 'jetpack_sitemaps_purge_data',
  102. array( $this, 'callback_action_purge_data' )
  103. );
  104. /*
  105. * Module parameters are stored as options in the database.
  106. * This allows us to avoid having to process all of init
  107. * before serving the sitemap data. The following actions
  108. * process and store these filters.
  109. */
  110. // Process filters and store location string for sitemap.
  111. add_action(
  112. 'init',
  113. array( $this, 'callback_action_filter_sitemap_location' ),
  114. 999
  115. );
  116. return;
  117. }
  118. /**
  119. * Echo a raw string of given content-type.
  120. *
  121. * @access private
  122. * @since 4.8.0
  123. *
  124. * @param string $the_content_type The content type to be served.
  125. * @param string $the_content The string to be echoed.
  126. */
  127. private function serve_raw_and_die( $the_content_type, $the_content ) {
  128. header( 'Content-Type: ' . $the_content_type . '; charset=UTF-8' );
  129. global $wp_query;
  130. $wp_query->is_feed = true;
  131. set_query_var( 'feed', 'sitemap' );
  132. if ( '' === $the_content ) {
  133. wp_die(
  134. esc_html__( "No sitemap found. Maybe it's being generated. Please try again later.", 'jetpack' ),
  135. esc_html__( 'Sitemaps', 'jetpack' ),
  136. array(
  137. 'response' => 404,
  138. )
  139. );
  140. }
  141. echo $the_content;
  142. die();
  143. }
  144. /**
  145. * Callback to intercept sitemap url requests and serve sitemap files.
  146. *
  147. * @access public
  148. * @since 4.8.0
  149. */
  150. public function callback_action_catch_sitemap_urls() {
  151. // Regular expressions for sitemap URL routing.
  152. $regex = array(
  153. 'master' => '/^sitemap\.xml$/',
  154. 'sitemap' => '/^sitemap-[1-9][0-9]*\.xml$/',
  155. 'index' => '/^sitemap-index-[1-9][0-9]*\.xml$/',
  156. 'sitemap-style' => '/^sitemap\.xsl$/',
  157. 'index-style' => '/^sitemap-index\.xsl$/',
  158. 'image' => '/^image-sitemap-[1-9][0-9]*\.xml$/',
  159. 'image-index' => '/^image-sitemap-index-[1-9][0-9]*\.xml$/',
  160. 'image-style' => '/^image-sitemap\.xsl$/',
  161. 'video' => '/^video-sitemap-[1-9][0-9]*\.xml$/',
  162. 'video-index' => '/^video-sitemap-index-[1-9][0-9]*\.xml$/',
  163. 'video-style' => '/^video-sitemap\.xsl$/',
  164. 'news' => '/^news-sitemap\.xml$/',
  165. 'news-style' => '/^news-sitemap\.xsl$/',
  166. );
  167. // The raw path(+query) of the requested URI.
  168. if ( isset( $_SERVER['REQUEST_URI'] ) ) { // WPCS: Input var okay.
  169. $raw_uri = sanitize_text_field(
  170. wp_unslash( $_SERVER['REQUEST_URI'] ) // WPCS: Input var okay.
  171. );
  172. } else {
  173. $raw_uri = '';
  174. }
  175. $request = $this->finder->recognize_sitemap_uri( $raw_uri );
  176. if ( isset( $request['sitemap_name'] ) ) {
  177. /**
  178. * Filter the content type used to serve the sitemap XML files.
  179. *
  180. * @module sitemaps
  181. *
  182. * @since 3.9.0
  183. *
  184. * @param string $xml_content_type By default, it's 'text/xml'.
  185. */
  186. $xml_content_type = apply_filters( 'jetpack_sitemap_content_type', 'text/xml' );
  187. // Catch master sitemap xml.
  188. if ( preg_match( $regex['master'], $request['sitemap_name'] ) ) {
  189. $this->serve_raw_and_die(
  190. $xml_content_type,
  191. $this->librarian->get_sitemap_text(
  192. jp_sitemap_filename( JP_MASTER_SITEMAP_TYPE, 0 ),
  193. JP_MASTER_SITEMAP_TYPE
  194. )
  195. );
  196. }
  197. // Catch sitemap xml.
  198. if ( preg_match( $regex['sitemap'], $request['sitemap_name'] ) ) {
  199. $this->serve_raw_and_die(
  200. $xml_content_type,
  201. $this->librarian->get_sitemap_text(
  202. $request['sitemap_name'],
  203. JP_PAGE_SITEMAP_TYPE
  204. )
  205. );
  206. }
  207. // Catch sitemap index xml.
  208. if ( preg_match( $regex['index'], $request['sitemap_name'] ) ) {
  209. $this->serve_raw_and_die(
  210. $xml_content_type,
  211. $this->librarian->get_sitemap_text(
  212. $request['sitemap_name'],
  213. JP_PAGE_SITEMAP_INDEX_TYPE
  214. )
  215. );
  216. }
  217. // Catch sitemap xsl.
  218. if ( preg_match( $regex['sitemap-style'], $request['sitemap_name'] ) ) {
  219. $this->serve_raw_and_die(
  220. 'application/xml',
  221. Jetpack_Sitemap_Stylist::sitemap_xsl()
  222. );
  223. }
  224. // Catch sitemap index xsl.
  225. if ( preg_match( $regex['index-style'], $request['sitemap_name'] ) ) {
  226. $this->serve_raw_and_die(
  227. 'application/xml',
  228. Jetpack_Sitemap_Stylist::sitemap_index_xsl()
  229. );
  230. }
  231. // Catch image sitemap xml.
  232. if ( preg_match( $regex['image'], $request['sitemap_name'] ) ) {
  233. $this->serve_raw_and_die(
  234. $xml_content_type,
  235. $this->librarian->get_sitemap_text(
  236. $request['sitemap_name'],
  237. JP_IMAGE_SITEMAP_TYPE
  238. )
  239. );
  240. }
  241. // Catch image sitemap index xml.
  242. if ( preg_match( $regex['image-index'], $request['sitemap_name'] ) ) {
  243. $this->serve_raw_and_die(
  244. $xml_content_type,
  245. $this->librarian->get_sitemap_text(
  246. $request['sitemap_name'],
  247. JP_IMAGE_SITEMAP_INDEX_TYPE
  248. )
  249. );
  250. }
  251. // Catch image sitemap xsl.
  252. if ( preg_match( $regex['image-style'], $request['sitemap_name'] ) ) {
  253. $this->serve_raw_and_die(
  254. 'application/xml',
  255. Jetpack_Sitemap_Stylist::image_sitemap_xsl()
  256. );
  257. }
  258. // Catch video sitemap xml.
  259. if ( preg_match( $regex['video'], $request['sitemap_name'] ) ) {
  260. $this->serve_raw_and_die(
  261. $xml_content_type,
  262. $this->librarian->get_sitemap_text(
  263. $request['sitemap_name'],
  264. JP_VIDEO_SITEMAP_TYPE
  265. )
  266. );
  267. }
  268. // Catch video sitemap index xml.
  269. if ( preg_match( $regex['video-index'], $request['sitemap_name'] ) ) {
  270. $this->serve_raw_and_die(
  271. $xml_content_type,
  272. $this->librarian->get_sitemap_text(
  273. $request['sitemap_name'],
  274. JP_VIDEO_SITEMAP_INDEX_TYPE
  275. )
  276. );
  277. }
  278. // Catch video sitemap xsl.
  279. if ( preg_match( $regex['video-style'], $request['sitemap_name'] ) ) {
  280. $this->serve_raw_and_die(
  281. 'application/xml',
  282. Jetpack_Sitemap_Stylist::video_sitemap_xsl()
  283. );
  284. }
  285. // Catch news sitemap xml.
  286. if ( preg_match( $regex['news'], $request['sitemap_name'] ) ) {
  287. $sitemap_builder = new Jetpack_Sitemap_Builder();
  288. $this->serve_raw_and_die(
  289. $xml_content_type,
  290. $sitemap_builder->news_sitemap_xml()
  291. );
  292. }
  293. // Catch news sitemap xsl.
  294. if ( preg_match( $regex['news-style'], $request['sitemap_name'] ) ) {
  295. $this->serve_raw_and_die(
  296. 'application/xml',
  297. Jetpack_Sitemap_Stylist::news_sitemap_xsl()
  298. );
  299. }
  300. }
  301. // URL did not match any sitemap patterns.
  302. return;
  303. }
  304. /**
  305. * Callback for adding sitemap-interval to the list of schedules.
  306. *
  307. * @access public
  308. * @since 4.8.0
  309. *
  310. * @param array $schedules The array of WP_Cron schedules.
  311. *
  312. * @return array The updated array of WP_Cron schedules.
  313. */
  314. public function callback_add_sitemap_schedule( $schedules ) {
  315. $schedules['sitemap-interval'] = array(
  316. 'interval' => JP_SITEMAP_INTERVAL,
  317. 'display' => __( 'Sitemap Interval', 'jetpack' ),
  318. );
  319. return $schedules;
  320. }
  321. /**
  322. * Callback handler for sitemap cron hook
  323. *
  324. * @access public
  325. */
  326. public function callback_sitemap_cron_hook() {
  327. $sitemap_builder = new Jetpack_Sitemap_Builder();
  328. $sitemap_builder->update_sitemap();
  329. }
  330. /**
  331. * Add actions to schedule sitemap generation.
  332. * Should only be called once, in the constructor.
  333. *
  334. * @access private
  335. * @since 4.8.0
  336. */
  337. private function schedule_sitemap_generation() {
  338. // Add cron schedule.
  339. add_filter( 'cron_schedules', array( $this, 'callback_add_sitemap_schedule' ) );
  340. add_action(
  341. 'jp_sitemap_cron_hook',
  342. array( $this, 'callback_sitemap_cron_hook' )
  343. );
  344. if ( ! wp_next_scheduled( 'jp_sitemap_cron_hook' ) ) {
  345. wp_schedule_event(
  346. time(),
  347. 'sitemap-interval',
  348. 'jp_sitemap_cron_hook'
  349. );
  350. }
  351. }
  352. /**
  353. * Callback to add sitemap to robots.txt.
  354. *
  355. * @access public
  356. * @since 4.8.0
  357. */
  358. public function callback_action_do_robotstxt() {
  359. /**
  360. * Filter whether to make the default sitemap discoverable to robots or not. Default true.
  361. *
  362. * @module sitemaps
  363. * @since 3.9.0
  364. *
  365. * @param bool $discover_sitemap Make default sitemap discoverable to robots.
  366. */
  367. $discover_sitemap = apply_filters( 'jetpack_sitemap_generate', true );
  368. if ( true === $discover_sitemap ) {
  369. $sitemap_url = $this->finder->construct_sitemap_url( 'sitemap.xml' );
  370. echo 'Sitemap: ' . esc_url( $sitemap_url ) . "\n";
  371. }
  372. /**
  373. * Filter whether to make the news sitemap discoverable to robots or not. Default true.
  374. *
  375. * @module sitemaps
  376. * @since 3.9.0
  377. *
  378. * @param bool $discover_news_sitemap Make default news sitemap discoverable to robots.
  379. */
  380. $discover_news_sitemap = apply_filters( 'jetpack_news_sitemap_generate', true );
  381. if ( true === $discover_news_sitemap ) {
  382. $news_sitemap_url = $this->finder->construct_sitemap_url( 'news-sitemap.xml' );
  383. echo 'Sitemap: ' . esc_url( $news_sitemap_url ) . "\n";
  384. }
  385. return;
  386. }
  387. /**
  388. * Callback to delete the news sitemap cache.
  389. *
  390. * @access public
  391. * @since 4.8.0
  392. */
  393. public function callback_action_flush_news_sitemap_cache() {
  394. delete_transient( 'jetpack_news_sitemap_xml' );
  395. }
  396. /**
  397. * Callback for resetting stored sitemap data.
  398. *
  399. * @access public
  400. * @since 5.3.0
  401. */
  402. public function callback_action_purge_data() {
  403. $this->callback_action_flush_news_sitemap_cache();
  404. $this->librarian->delete_all_stored_sitemap_data();
  405. }
  406. /**
  407. * Callback to set the sitemap location.
  408. *
  409. * @access public
  410. * @since 4.8.0
  411. */
  412. public function callback_action_filter_sitemap_location() {
  413. update_option(
  414. 'jetpack_sitemap_location',
  415. /**
  416. * Additional path for sitemap URIs. Default value is empty.
  417. *
  418. * This string is any additional path fragment you want included between
  419. * the home URL and the sitemap filenames. Exactly how this fragment is
  420. * interpreted depends on your permalink settings. For example:
  421. *
  422. * Pretty permalinks:
  423. * home_url() . jetpack_sitemap_location . '/sitemap.xml'
  424. *
  425. * Plain ("ugly") permalinks:
  426. * home_url() . jetpack_sitemap_location . '/?jetpack-sitemap=sitemap.xml'
  427. *
  428. * PATHINFO permalinks:
  429. * home_url() . '/index.php' . jetpack_sitemap_location . '/sitemap.xml'
  430. *
  431. * where 'sitemap.xml' is the name of a specific sitemap file.
  432. * The value of this filter must be a valid path fragment per RFC 3986;
  433. * in particular it must either be empty or begin with a '/'.
  434. * Also take care that any restrictions on sitemap location imposed by
  435. * the sitemap protocol are satisfied.
  436. *
  437. * The result of this filter is stored in an option, 'jetpack_sitemap_location';
  438. * that option is what gets read when the sitemap location is needed.
  439. * This way we don't have to wait for init to finish before building sitemaps.
  440. *
  441. * @link https://tools.ietf.org/html/rfc3986#section-3.3 RFC 3986
  442. * @link http://www.sitemaps.org/ The sitemap protocol
  443. *
  444. * @since 4.8.0
  445. */
  446. apply_filters(
  447. 'jetpack_sitemap_location',
  448. ''
  449. )
  450. );
  451. return;
  452. }
  453. } // End Jetpack_Sitemap_Manager class.
  454. new Jetpack_Sitemap_Manager();
  455. /**
  456. * Absolute URL of the current blog's sitemap.
  457. *
  458. * @module sitemaps
  459. *
  460. * @since 3.9.0
  461. * @since 4.8.1 Code uses method found in Jetpack_Sitemap_Finder::construct_sitemap_url in 4.8.0.
  462. * It has been moved here to avoid fatal errors with other plugins that were expecting to find this function.
  463. *
  464. * @param string $filename Sitemap file name. Defaults to 'sitemap.xml', the initial sitemaps page.
  465. *
  466. * @return string Sitemap URL.
  467. */
  468. function jetpack_sitemap_uri( $filename = 'sitemap.xml' ) {
  469. global $wp_rewrite;
  470. $location = Jetpack_Options::get_option_and_ensure_autoload( 'jetpack_sitemap_location', '' );
  471. if ( $wp_rewrite->using_index_permalinks() ) {
  472. $sitemap_url = home_url( '/index.php' . $location . '/' . $filename );
  473. } elseif ( $wp_rewrite->using_permalinks() ) {
  474. $sitemap_url = home_url( $location . '/' . $filename );
  475. } else {
  476. $sitemap_url = home_url( $location . '/?jetpack-sitemap=' . $filename );
  477. }
  478. /**
  479. * Filter sitemap URL relative to home URL.
  480. *
  481. * @module sitemaps
  482. *
  483. * @since 3.9.0
  484. *
  485. * @param string $sitemap_url Sitemap URL.
  486. */
  487. return apply_filters( 'jetpack_sitemap_location', $sitemap_url );
  488. }