/** Wikitext sanitation for MediaWiki Author: [[User:Lupo]], January 2008 License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0) Choose whichever license of these you like best :-) */ // /* global mw:false, TextCleaner:true */ /* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */ /* jshint curly:false, eqnull:true, laxbreak:true */ (function () { 'use strict'; window.TextCleaner = { imgNamespaceNames: null, // This function attempts to construct well-formed wikitext from input that may contain // possibly broken wikitext. // // Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence // of templates, and due to the fact that image thumbnail captions may themselves contain // links. This implementation catches the most common errors (such as forgetting to close a // template or a link), and even some more elaborate ones. With enough malice, this sanitation // can still be broken by user input such that the result is not well-formed wikitext as the // parser at the servers would like to have it. (It's still possible that the result is broken // wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext // into broken wikitext.) // // If 'only_thumbs' is true, all [[Image: links are changed to [[:Image:, unless the original // image link was a thumbnail or had a width smaller than 300px specified. // // WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is // probably rather inefficient due to the many substrings that are generated. This function is // primarily intended to be used to clean up user input in forms, which are typically rather // short. 
// ---------------------------------------------------------------------------------------------
// sanitizeWikiText(input, only_thumbs)
//
// Parameters:
//   input       - string of wikitext to clean up.
//   only_thumbs - see the description above. The flag actually tested by the visible code is
//                 'allow_only_thumbs', whose definition is not visible in this copy; presumably
//                 it is derived from 'only_thumbs' - TODO confirm.
// Returns:
//   The string produced by the inner 'sanitize' pass, with the placeholders mapped back to
//   their multi-character tokens.
//
// How the visible code works:
//   1. Multi-character tokens are replaced by one-character placeholders:
//        \x01 = '{{'   \x02 = '}}'   \x03 = '[['   \x04 = ']]'
//        \x05 = table start '{|' (at the very beginning of the text or after a newline)
//        \x06 = table end '|}' (after a newline)
//        \x07 = opening gallery tag   \x08 = closing gallery tag
//      A newline followed by '|}}}' becomes '\n\x06\x02' (table end + template end), and that
//      replacement runs before the plain '}}' one.
//   2. sanitize(s, true, 0, true, false, true, true) is called at the top level: with links,
//      allow thumbs, don't break at pipe, allow tables, allow galleries.
//   3. Opening placeholders \x01, \x05 and \x07 push the matching closer (char code + 1) via
//      push_end(). A closer that is not on top of 'endings' is emitted as literal text by the
//      \x02 / \x06 case. After the loop every still-open construct is closed, with a newline
//      emitted before a table end (\x06).
//   4. Links ('\x03' or '[') are scanned by recursive sanitize() calls with caption_level + 1.
//      The number of characters a recursive call used is read back from
//      consumed[caption_level]; the callee stores it at consumed[caption_level - 1].
//   5. For image links with parameters, the width comes from an 'NNNpx' parameter. Without
//      one it defaults to 180 for thumbs and 301 otherwise. 'is_small' means width <= 300.
//
// NOTE(review): this copy of the file appears to be damaged and is not valid JavaScript as it
// stands:
//   - Text is missing between the first 'input.search(' regular expression and the branch that
//     handles HTML comments. The header of 'sanitize', its main loop, and the definitions of
//     regexp, base_regexp, nowiki_regexp, push_end, pop_end, get_initial, endings, consumed
//     and allow_only_thumbs are not present here.
//   - Line breaks were collapsed, so every '//' comment now swallows the code that follows it
//     on the same physical line.
//   - Several string literals look stripped or decoded: "result += ''" when a nowiki is still
//     open, the empty replacement strings for \x07 and \x08, and the '|' case where both arms
//     append the same '|'. Presumably these held tags or HTML entities - verify against a
//     pristine copy.
// Restore the file from its original source before changing any logic.
// ---------------------------------------------------------------------------------------------
sanitizeWikiText: function (input, only_thumbs) { if (input.search(/[\][}{]|]*)?>|', next + 3); if (i < 0) { result += s + '-->'; s = ''; } else { result += s.substring(0, i + 3); s = s.substring(i + 3); } } else if (s.charAt(next + 1) === 'n') { // Nowiki may contain HTML comments! in_nowiki = true; regexp = nowiki_regexp; result += s.substring(0, next + 7); s = s.substring(next + 7); } else { // End of nowiki. Searched for and found only if in_nowiki === true in_nowiki = false; regexp = base_regexp; i = s.indexOf('>', next + 1); // End of tag result += s.substring(0, i + 1); s = s.substring(i + 1); } break; case '\x05': // Table start if (!with_tables) { result += s.substring(0, next); get_out = true; break; } /* fall through */ case '\x07': if (ch === '\x07' && !with_galleries) { result += s.substring(0, next); get_out = true; break; } /* fall through */ case '\x01': // Start of template, table, or gallery result += s.substring(0, next + 1); push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0)); s = s.substring(next + 1); break; case '\x06': // Table end if (break_at_pipe && !endings) { result += s.substring(0, next); get_out = true; break; } /* fall through */ case '\x02': // End of a template or table result += s.substring(0, next); if (!endings || endings[endings.length - 1] !== ch) { // Spurious template or table end if (ch === '\x02') result += '}}'; else result += '|}'; } else { result += pop_end(); } s = s.substring(next + 1); break; case '\x08': // End of gallery result += s.substring(0, next + 1); if (endings && endings[endings.length - 1] === ch) pop_end(); s = s.substring(next + 1); break; case '\x03': case '[': { if (!with_links && !endings) { get_out = true; break; } // Image links must be treated specially, since they may contain nested links // in the caption! 
var initial = null; // If set, it's 'image:' or 'file:' and we have an image link i = next; while (i < s.length && s.charAt(i) === ch) i++; if (ch === '\x03' && i < s.length && s.charAt(i) === '[') i++; initial = get_initial(i, s); // Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |} var lk_text = sanitize(s.substring(i), false, // No links at top-level allowed caption_level + 1, false, // No thumbs true, // Break at pipe false, // No tables false); // No galleries var lk_text_length = consumed[caption_level]; j = i + lk_text_length; if (j >= s.length) { // Used up the whole text: [[Foo or [bar if (initial && allow_only_thumbs) { // Should in any case have started with [[, not [ result += s.substring(0, i - 1) + '\x03:' + initial + lk_text.substring(initial.length) + '\x04'; } else { result += s.substring(0, i) + lk_text + ((s.charAt(i - 1) === '[') ? ']' : '\x04'); } s = ''; break; } if (s.charAt(j) === '|') k = j; else k = -1; if (k < 0) { // No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar] if (initial && allow_only_thumbs) { // Should in any case have started with [[, not [ result += s.substring(0, i - 1) + '\x03:' + initial + lk_text.substring(initial.length) + '\x04'; } else { result += s.substring(0, i) + lk_text + ((s.charAt(i - 1) === '[') ? ']' : '\x04'); } if (s.charAt(j) === ']' || s.charAt(j) === '\x04') { // Indeed closing the link s = s.substring(j + 1); } else { s = s.substring(j); } break; } else { var caption = null; var used = 0; // Pipe found. if (!initial) { // Not an image link. Must be something like [[Foo|Bar]]. caption = sanitize( s.substring(k + 1), // No links, please false, caption_level + 1, // No thumbs either false, // Don't care about pipes false, // Allow tables (yes, parser allows that!) true, // Allow galleries (?) true); // Now we're at [[, [, ]], or ] used = consumed[caption_level]; result += s.substring(0, i) + lk_text + '|' + caption + ((s.charAt(i - 1) === '[') ? 
']' : '\x04'); } else { var q = s.substring(k); // We assume that there are no templates, nowikis, and other nasty things // in the parameters. Search forward until the next [, {, ], } var l = q.search(/[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/); if (l < 0) l = q.length; if (l + 1 < q.length) q = q.substring(0, l + 1); var is_thumb = q.search(/\|\s*thumb(nail)?\s*[|\x04]/) >= 0; var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec(q); if (img_width && img_width.length > 1) { img_width = parseInt(img_width[1], 10); if (isNaN(img_width)) img_width = null; } else { img_width = null; } if (!img_width) img_width = is_thumb ? 180 : 301; var is_small = img_width <= 300; // Caption starts at the last pipe before l. If that is a parameter, // it doesn't hurt. var m = k + q.lastIndexOf('|', l); caption = sanitize( s.substring(m + 1), // Allow links only if it's a thumb is_thumb, caption_level + 1, allow_thumbs && is_thumb, // Don't break at pipe false, // Tables only if it's a thumb is_thumb, // Allow galleries for thumbs (?) is_thumb); used = consumed[caption_level]; // caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04' is_thumb = allow_thumbs && is_small; if (is_thumb || !allow_only_thumbs) { result += s.substring(0, i - 1) + '\x03' + lk_text; } else { result += s.substring(0, i - 1) + '\x03:' + initial + lk_text.substring(initial.length); } result += s.substring(k, m + 1) + caption + '\x04'; k = m; } next = k + 1 + used; if (next < s.length) { if (s.charAt(next) !== '\x04') s = s.substring(next); else s = s.substring(next + 1); } else { s = ''; } } break; } case '\x04': case ']': // Extra bracket. result += s.substring(0, next); if (!caption_level && !break_at_pipe) { result += (ch === ']' ? 
']' : ']]'); s = s.substring(next + 1); } else { get_out = true; } break; case '|': result += s.substring(0, next); if (break_at_pipe && !endings) { // Pipe character at top level get_out = true; } else { if (!caption_level && !break_at_pipe && !endings) result += '|'; // Top-level pipe character else result += '|'; s = s.substring(next + 1); } break; } // end switch } // end while if (in_nowiki) result += ''; // Make sure this nowiki is closed. // Close open templates and tables while (endings) { ch = pop_end(); result += (ch === '\x06' ? '\n' : '') + ch; } if (caption_level > 0) { var used_up = initial_length - (get_out ? (s.length - next) : 0); if (consumed.length < caption_level) consumed.push(used_up); else consumed[caption_level - 1] = used_up; } return result; } // Replace multi-character tokens by one-character placeholders, simplifying the // subsequent processing. var s = input.replace(/\{\{/g, '\x01') .replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end .replace(/\}\}/g, '\x02') .replace(/\[\[/g, '\x03') .replace(/\]\]/g, '\x04') .replace(/\n\s*\{\|/g, '\n\x05') // Table start and end must be on own line .replace(/^\s*\{\|/, '\x05') // Table start at the very beginning .replace(/\n\s*\|\}/g, '\n\x06') // (we strip leading whitespace) .replace(/<\s*gallery\s*>/g, '\x07') .replace(/<\/\s*gallery\s*>/g, '\x08'); s = sanitize(s, true, 0, true, false, true, true); // with links, allow thumbs, don't break at pipe, allow tables, allow galleries return s.replace(/\x01/g, '{{') .replace(/\x02/g, '}}') .replace(/\x03/g, '[[') .replace(/\x04/g, ']]') .replace(/\x05/g, '{|') .replace(/\x06/g, '|}') .replace(/\x07/g, '') .replace(/\x08/g, ''); } }; }()); //