gfm.php 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400
  1. <?php
  2. /**
  3. * GitHub-Flavoured Markdown. Inspired by Evan's plugin, but modified.
  4. *
  5. * @author Evan Solomon
  6. * @author Matt Wiebe <wiebe@automattic.com>
  7. * @link https://github.com/evansolomon/wp-github-flavored-markdown-comments
  8. *
  9. * Add a few extras from GitHub's Markdown implementation. Must be used in a WordPress environment.
  10. */
  11. class WPCom_GHF_Markdown_Parser extends MarkdownExtra_Parser {
  12. /**
  13. * Hooray somewhat arbitrary numbers that are fearful of 1.0.x.
  14. */
  15. const WPCOM_GHF_MARDOWN_VERSION = '0.9.0';
  16. /**
  17. * Use a [code] shortcode when encountering a fenced code block
  18. * @var boolean
  19. */
  20. public $use_code_shortcode = true;
  21. /**
  22. * Preserve shortcodes, untouched by Markdown.
  23. * This requires use within a WordPress installation.
  24. * @var boolean
  25. */
  26. public $preserve_shortcodes = true;
  27. /**
  28. * Preserve the legacy $latex your-latex-code-here$ style
  29. * LaTeX markup
  30. */
  31. public $preserve_latex = true;
  32. /**
  33. * Preserve single-line <code> blocks.
  34. * @var boolean
  35. */
  36. public $preserve_inline_code_blocks = true;
  37. /**
  38. * Strip paragraphs from the output. This is the right default for WordPress,
  39. * which generally wants to create its own paragraphs with `wpautop`
  40. * @var boolean
  41. */
  42. public $strip_paras = true;
  43. // Will run through sprintf - you can supply your own syntax if you want
  44. public $shortcode_start = '[code lang=%s]';
  45. public $shortcode_end = '[/code]';
  46. // Stores shortcodes we remove and then replace
  47. protected $preserve_text_hash = array();
  48. /**
  49. * Set environment defaults based on presence of key functions/classes.
  50. */
  51. public function __construct() {
  52. $this->use_code_shortcode = class_exists( 'SyntaxHighlighter' );
  53. /**
  54. * Allow processing shortcode contents.
  55. *
  56. * @module markdown
  57. *
  58. * @since 4.4.0
  59. *
  60. * @param boolean $preserve_shortcodes Defaults to $this->preserve_shortcodes.
  61. */
  62. $this->preserve_shortcodes = apply_filters( 'jetpack_markdown_preserve_shortcodes', $this->preserve_shortcodes ) && function_exists( 'get_shortcode_regex' );
  63. $this->preserve_latex = function_exists( 'latex_markup' );
  64. $this->strip_paras = function_exists( 'wpautop' );
  65. parent::__construct();
  66. }
  67. /**
  68. * Overload to specify heading styles only if the hash has space(s) after it. This is actually in keeping with
  69. * the documentation and eases the semantic overload of the hash character.
  70. * #Will Not Produce a Heading 1
  71. * # This Will Produce a Heading 1
  72. *
  73. * @param string $text Markdown text
  74. * @return string HTML-transformed text
  75. */
  76. public function transform( $text ) {
  77. // Preserve anything inside a single-line <code> element
  78. if ( $this->preserve_inline_code_blocks ) {
  79. $text = $this->single_line_code_preserve( $text );
  80. }
  81. // Remove all shortcodes so their interiors are left intact
  82. if ( $this->preserve_shortcodes ) {
  83. $text = $this->shortcode_preserve( $text );
  84. }
  85. // Remove legacy LaTeX so it's left intact
  86. if ( $this->preserve_latex ) {
  87. $text = $this->latex_preserve( $text );
  88. }
  89. // escape line-beginning # chars that do not have a space after them.
  90. $text = preg_replace_callback( '|^#{1,6}( )?|um', array( $this, '_doEscapeForHashWithoutSpacing' ), $text );
  91. /**
  92. * Allow third-party plugins to define custom patterns that won't be processed by Markdown.
  93. *
  94. * @module markdown
  95. *
  96. * @since 3.9.2
  97. *
  98. * @param array $custom_patterns Array of custom patterns to be ignored by Markdown.
  99. */
  100. $custom_patterns = apply_filters( 'jetpack_markdown_preserve_pattern', array() );
  101. if ( is_array( $custom_patterns ) && ! empty( $custom_patterns ) ) {
  102. foreach ( $custom_patterns as $pattern ) {
  103. $text = preg_replace_callback( $pattern, array( $this, '_doRemoveText'), $text );
  104. }
  105. }
  106. // run through core Markdown
  107. $text = parent::transform( $text );
  108. // Occasionally Markdown Extra chokes on a para structure, producing odd paragraphs.
  109. $text = str_replace( "<p>&lt;</p>\n\n<p>p>", '<p>', $text );
  110. // put start-of-line # chars back in place
  111. $text = $this->restore_leading_hash( $text );
  112. // Strip paras if set
  113. if ( $this->strip_paras ) {
  114. $text = $this->unp( $text );
  115. }
  116. // Restore preserved things like shortcodes/LaTeX
  117. $text = $this->do_restore( $text );
  118. return $text;
  119. }
  120. /**
  121. * Prevents blocks like <code>__this__</code> from turning into <code><strong>this</strong></code>
  122. * @param string $text Text that may need preserving
  123. * @return string Text that was preserved if needed
  124. */
  125. public function single_line_code_preserve( $text ) {
  126. return preg_replace_callback( '|<code\b[^>]*>(.*?)</code>|', array( $this, 'do_single_line_code_preserve' ), $text );
  127. }
  128. /**
  129. * Regex callback for inline code presevation
  130. * @param array $matches Regex matches
  131. * @return string Hashed content for later restoration
  132. */
  133. public function do_single_line_code_preserve( $matches ) {
  134. return '<code>' . $this->hash_block( $matches[1] ) . '</code>';
  135. }
  136. /**
  137. * Preserve code block contents by HTML encoding them. Useful before getting to KSES stripping.
  138. * @param string $text Markdown/HTML content
  139. * @return string Markdown/HTML content with escaped code blocks
  140. */
  141. public function codeblock_preserve( $text ) {
  142. return preg_replace_callback( "/^([`~]{3})([^`\n]+)?\n([^`~]+)(\\1)/m", array( $this, 'do_codeblock_preserve' ), $text );
  143. }
  144. /**
  145. * Regex callback for code block preservation.
  146. * @param array $matches Regex matches
  147. * @return string Codeblock with escaped interior
  148. */
  149. public function do_codeblock_preserve( $matches ) {
  150. $block = stripslashes( $matches[3] );
  151. $block = esc_html( $block );
  152. $block = str_replace( '\\', '\\\\', $block );
  153. $open = $matches[1] . $matches[2] . "\n";
  154. return $open . $block . $matches[4];
  155. }
  156. /**
  157. * Restore previously preserved (i.e. escaped) code block contents.
  158. * @param string $text Markdown/HTML content with escaped code blocks
  159. * @return string Markdown/HTML content
  160. */
  161. public function codeblock_restore( $text ) {
  162. return preg_replace_callback( "/^([`~]{3})([^`\n]+)?\n([^`~]+)(\\1)/m", array( $this, 'do_codeblock_restore' ), $text );
  163. }
  164. /**
  165. * Regex callback for code block restoration (unescaping).
  166. * @param array $matches Regex matches
  167. * @return string Codeblock with unescaped interior
  168. */
  169. public function do_codeblock_restore( $matches ) {
  170. $block = html_entity_decode( $matches[3], ENT_QUOTES );
  171. $open = $matches[1] . $matches[2] . "\n";
  172. return $open . $block . $matches[4];
  173. }
  174. /**
  175. * Called to preserve legacy LaTeX like $latex some-latex-text $
  176. * @param string $text Text in which to preserve LaTeX
  177. * @return string Text with LaTeX replaced by a hash that will be restored later
  178. */
  179. protected function latex_preserve( $text ) {
  180. // regex from latex_remove()
  181. $regex = '%
  182. \$latex(?:=\s*|\s+)
  183. ((?:
  184. [^$]+ # Not a dollar
  185. |
  186. (?<=(?<!\\\\)\\\\)\$ # Dollar preceded by exactly one slash
  187. )+)
  188. (?<!\\\\)\$ # Dollar preceded by zero slashes
  189. %ix';
  190. $text = preg_replace_callback( $regex, array( $this, '_doRemoveText'), $text );
  191. return $text;
  192. }
  193. /**
  194. * Called to preserve WP shortcodes from being formatted by Markdown in any way.
  195. * @param string $text Text in which to preserve shortcodes
  196. * @return string Text with shortcodes replaced by a hash that will be restored later
  197. */
  198. protected function shortcode_preserve( $text ) {
  199. $text = preg_replace_callback( $this->get_shortcode_regex(), array( $this, '_doRemoveText' ), $text );
  200. return $text;
  201. }
  202. /**
  203. * Restores any text preserved by $this->hash_block()
  204. * @param string $text Text that may have hashed preservation placeholders
  205. * @return string Text with hashed preseravtion placeholders replaced by original text
  206. */
  207. protected function do_restore( $text ) {
  208. // Reverse hashes to ensure nested blocks are restored.
  209. $hashes = array_reverse( $this->preserve_text_hash, true );
  210. foreach( $hashes as $hash => $value ) {
  211. $placeholder = $this->hash_maker( $hash );
  212. $text = str_replace( $placeholder, $value, $text );
  213. }
  214. // reset the hash
  215. $this->preserve_text_hash = array();
  216. return $text;
  217. }
  218. /**
  219. * Regex callback for text preservation
  220. * @param array $m Regex $matches array
  221. * @return string A placeholder that will later be replaced by the original text
  222. */
  223. protected function _doRemoveText( $m ) {
  224. return $this->hash_block( $m[0] );
  225. }
  226. /**
  227. * Call this to store a text block for later restoration.
  228. * @param string $text Text to preserve for later
  229. * @return string Placeholder that will be swapped out later for the original text
  230. */
  231. protected function hash_block( $text ) {
  232. $hash = md5( $text );
  233. $this->preserve_text_hash[ $hash ] = $text;
  234. $placeholder = $this->hash_maker( $hash );
  235. return $placeholder;
  236. }
  237. /**
  238. * Less glamorous than the Keymaker
  239. * @param string $hash An md5 hash
  240. * @return string A placeholder hash
  241. */
  242. protected function hash_maker( $hash ) {
  243. return 'MARKDOWN_HASH' . $hash . 'MARKDOWN_HASH';
  244. }
  245. /**
  246. * Remove bare <p> elements. <p>s with attributes will be preserved.
  247. * @param string $text HTML content
  248. * @return string <p>-less content
  249. */
  250. public function unp( $text ) {
  251. return preg_replace( "#<p>(.*?)</p>(\n|$)#ums", '$1$2', $text );
  252. }
  253. /**
  254. * A regex of all shortcodes currently registered by the current
  255. * WordPress installation
  256. * @uses get_shortcode_regex()
  257. * @return string A regex for grabbing shortcodes.
  258. */
  259. protected function get_shortcode_regex() {
  260. $pattern = get_shortcode_regex();
  261. // don't match markdown link anchors that could be mistaken for shortcodes.
  262. $pattern .= '(?!\()';
  263. return "/$pattern/s";
  264. }
  265. /**
  266. * Since we escape unspaced #Headings, put things back later.
  267. * @param string $text text with a leading escaped hash
  268. * @return string text with leading hashes unescaped
  269. */
  270. protected function restore_leading_hash( $text ) {
  271. return preg_replace( "/^(<p>)?(&#35;|\\\\#)/um", "$1#", $text );
  272. }
  273. /**
  274. * Overload to support ```-fenced code blocks for pre-Markdown Extra 1.2.8
  275. * https://help.github.com/articles/github-flavored-markdown#fenced-code-blocks
  276. */
  277. public function doFencedCodeBlocks( $text ) {
  278. // If we're at least at 1.2.8, native fenced code blocks are in.
  279. // Below is just copied from it in case we somehow got loaded on
  280. // top of someone else's Markdown Extra
  281. if ( version_compare( MARKDOWNEXTRA_VERSION, '1.2.8', '>=' ) )
  282. return parent::doFencedCodeBlocks( $text );
  283. #
  284. # Adding the fenced code block syntax to regular Markdown:
  285. #
  286. # ~~~
  287. # Code block
  288. # ~~~
  289. #
  290. $less_than_tab = $this->tab_width;
  291. $text = preg_replace_callback('{
  292. (?:\n|\A)
  293. # 1: Opening marker
  294. (
  295. (?:~{3,}|`{3,}) # 3 or more tildes/backticks.
  296. )
  297. [ ]*
  298. (?:
  299. \.?([-_:a-zA-Z0-9]+) # 2: standalone class name
  300. |
  301. '.$this->id_class_attr_catch_re.' # 3: Extra attributes
  302. )?
  303. [ ]* \n # Whitespace and newline following marker.
  304. # 4: Content
  305. (
  306. (?>
  307. (?!\1 [ ]* \n) # Not a closing marker.
  308. .*\n+
  309. )+
  310. )
  311. # Closing marker.
  312. \1 [ ]* (?= \n )
  313. }xm',
  314. array($this, '_doFencedCodeBlocks_callback'), $text);
  315. return $text;
  316. }
  317. /**
  318. * Callback for pre-processing start of line hashes to slyly escape headings that don't
  319. * have a leading space
  320. * @param array $m preg_match matches
  321. * @return string possibly escaped start of line hash
  322. */
  323. public function _doEscapeForHashWithoutSpacing( $m ) {
  324. if ( ! isset( $m[1] ) )
  325. $m[0] = '\\' . $m[0];
  326. return $m[0];
  327. }
  328. /**
  329. * Overload to support Viper's [code] shortcode. Because awesome.
  330. */
  331. public function _doFencedCodeBlocks_callback( $matches ) {
  332. // in case we have some escaped leading hashes right at the start of the block
  333. $matches[4] = $this->restore_leading_hash( $matches[4] );
  334. // just MarkdownExtra_Parser if we're not going ultra-deluxe
  335. if ( ! $this->use_code_shortcode ) {
  336. return parent::_doFencedCodeBlocks_callback( $matches );
  337. }
  338. // default to a "text" class if one wasn't passed. Helps with encoding issues later.
  339. if ( empty( $matches[2] ) ) {
  340. $matches[2] = 'text';
  341. }
  342. $classname =& $matches[2];
  343. $codeblock = preg_replace_callback('/^\n+/', array( $this, '_doFencedCodeBlocks_newlines' ), $matches[4] );
  344. if ( $classname{0} == '.' )
  345. $classname = substr( $classname, 1 );
  346. $codeblock = esc_html( $codeblock );
  347. $codeblock = sprintf( $this->shortcode_start, $classname ) . "\n{$codeblock}" . $this->shortcode_end;
  348. return "\n\n" . $this->hashBlock( $codeblock ). "\n\n";
  349. }
  350. }