/**
Wikitext sanitation for MediaWiki
Author: [[User:Lupo]], January 2008
License: Quadruple licensed GFDL, GPL, LGPL and Creative Commons Attribution 3.0 (CC-BY-3.0)
Choose whichever license of these you like best :-)
*/
//
/* global mw:false, TextCleaner:true */
/* eslint no-control-regex:0, one-var:0, vars-on-top:0, camelcase:0, curly:0, space-in-parens:0, computed-property-spacing:0, array-bracket-spacing:0 */
/* jshint curly:false, eqnull:true, laxbreak:true */
(function () {
'use strict';
window.TextCleaner = {
imgNamespaceNames: null,
// This function attempts to construct well-formed wikitext from input that may contain
// possibly broken wikitext.
//
// Note: even just a half-baked sanitation of wikitext is hyper-complex due to the presence
// of templates, and due to the fact that image thumbnail captions may themselves contain
// links. This implementation catches the most common errors (such as forgetting to close a
// template or a link), and even some more elaborate ones. With enough malice, this sanitation
// can still be broken by user input such that the result is not well-formed wikitext as the
// parser at the servers would like to have it. (It's still possible that the result is broken
// wikitext, if the input was broken wikitext. But it never transforms well-formed wikitext
// into broken wikitext.)
//
// If 'only_thumbs' is true, all [[Image: links are changed to [[:Image:, unless the original
// image link was a thumbnail or had a width smaller than 300px specified.
//
// WARNING: do *not* attempt to use this to process large texts (e.g., a whole article). It is
// probably rather inefficient due to the many substrings that are generated. This function is
// primarily intended to be used to clean up user input in forms, which are typically rather
// short.
sanitizeWikiText: function (input, only_thumbs) {
if (input.search(/[\][}{]|]*)?>|', next + 3);
if (i < 0) {
result += s + '-->';
s = '';
} else {
result += s.substring(0, i + 3);
s = s.substring(i + 3);
}
} else if (s.charAt(next + 1) === 'n') {
// Nowiki may contain HTML comments!
in_nowiki = true;
regexp = nowiki_regexp;
result += s.substring(0, next + 7);
s = s.substring(next + 7);
} else {
// End of nowiki. Searched for and found only if in_nowiki === true
in_nowiki = false;
regexp = base_regexp;
i = s.indexOf('>', next + 1); // End of tag
result += s.substring(0, i + 1);
s = s.substring(i + 1);
}
break;
case '\x05':
// Table start
if (!with_tables) {
result += s.substring(0, next);
get_out = true;
break;
}
/* fall through */
case '\x07':
if (ch === '\x07' && !with_galleries) {
result += s.substring(0, next);
get_out = true;
break;
}
/* fall through */
case '\x01':
// Start of template, table, or gallery
result += s.substring(0, next + 1);
push_end(String.fromCharCode(ch.charCodeAt(0) + 1).charAt(0));
s = s.substring(next + 1);
break;
case '\x06':
// Table end
if (break_at_pipe && !endings) {
result += s.substring(0, next);
get_out = true;
break;
}
/* fall through */
case '\x02':
// End of a template or table
result += s.substring(0, next);
if (!endings || endings[endings.length - 1] !== ch) {
// Spurious template or table end
if (ch === '\x02')
result += '}}';
else
result += '|}';
} else {
result += pop_end();
}
s = s.substring(next + 1);
break;
case '\x08':
// End of gallery
result += s.substring(0, next + 1);
if (endings && endings[endings.length - 1] === ch)
pop_end();
s = s.substring(next + 1);
break;
case '\x03':
case '[': {
if (!with_links && !endings) {
get_out = true;
break;
}
// Image links must be treated specially, since they may contain nested links
// in the caption!
var initial = null; // If set, it's 'image:' or 'file:' and we have an image link
i = next;
while (i < s.length && s.charAt(i) === ch)
i++;
if (ch === '\x03' && i < s.length && s.charAt(i) === '[')
i++;
initial = get_initial(i, s);
// Scan ahead. We'll break at the next top-level | or ] or ]] or [ or [[ or {| or |}
var lk_text = sanitize(s.substring(i),
false, // No links at top-level allowed
caption_level + 1,
false, // No thumbs
true, // Break at pipe
false, // No tables
false); // No galleries
var lk_text_length = consumed[caption_level];
j = i + lk_text_length;
if (j >= s.length) {
// Used up the whole text: [[Foo or [bar
if (initial && allow_only_thumbs) {
// Should in any case have started with [[, not [
result += s.substring(0, i - 1) + '\x03:' + initial +
lk_text.substring(initial.length) + '\x04';
} else {
result += s.substring(0, i) + lk_text +
((s.charAt(i - 1) === '[') ? ']' : '\x04');
}
s = '';
break;
}
if (s.charAt(j) === '|')
k = j;
else
k = -1;
if (k < 0) {
// No pipe found: we should be on the closing ]] or ] or [[Foo]] or [bar]
if (initial && allow_only_thumbs) {
// Should in any case have started with [[, not [
result += s.substring(0, i - 1) + '\x03:' + initial +
lk_text.substring(initial.length) + '\x04';
} else {
result += s.substring(0, i) + lk_text +
((s.charAt(i - 1) === '[') ? ']' : '\x04');
}
if (s.charAt(j) === ']' || s.charAt(j) === '\x04') {
// Indeed closing the link
s = s.substring(j + 1);
} else {
s = s.substring(j);
}
break;
} else {
var caption = null;
var used = 0;
// Pipe found.
if (!initial) {
// Not an image link. Must be something like [[Foo|Bar]].
caption = sanitize(
s.substring(k + 1),
// No links, please
false,
caption_level + 1,
// No thumbs either
false,
// Don't care about pipes
false,
// Allow tables (yes, parser allows that!)
true,
// Allow galleries (?)
true);
// Now we're at [[, [, ]], or ]
used = consumed[caption_level];
result += s.substring(0, i) + lk_text + '|' + caption +
((s.charAt(i - 1) === '[') ? ']' : '\x04');
} else {
var q = s.substring(k);
// We assume that there are no templates, nowikis, and other nasty things
// in the parameters. Search forward until the next [, {, ], }
var l = q.search(/[\x01\x02\x03[\x04\]{}\x05\x06\x07\x08]/);
if (l < 0)
l = q.length;
if (l + 1 < q.length)
q = q.substring(0, l + 1);
var is_thumb = q.search(/\|\s*thumb(nail)?\s*[|\x04]/) >= 0;
var img_width = /\|\s*(\d+)px\s*[|\x04]/.exec(q);
if (img_width && img_width.length > 1) {
img_width = parseInt(img_width[1], 10);
if (isNaN(img_width))
img_width = null;
} else {
img_width = null;
}
if (!img_width)
img_width = is_thumb ? 180 : 301;
var is_small = img_width <= 300;
// Caption starts at the last pipe before l. If that is a parameter,
// it doesn't hurt.
var m = k + q.lastIndexOf('|', l);
caption = sanitize(
s.substring(m + 1),
// Allow links only if it's a thumb
is_thumb,
caption_level + 1,
allow_thumbs && is_thumb,
// Don't break at pipe
false,
// Tables only if it's a thumb
is_thumb,
// Allow galleries for thumbs (?)
is_thumb);
used = consumed[caption_level];
// caption used 'used' chars from m+1, s.charAt(m+1+used) === '\x04'
is_thumb = allow_thumbs && is_small;
if (is_thumb || !allow_only_thumbs) {
result += s.substring(0, i - 1) + '\x03' + lk_text;
} else {
result += s.substring(0, i - 1) + '\x03:' + initial +
lk_text.substring(initial.length);
}
result += s.substring(k, m + 1) + caption + '\x04';
k = m;
}
next = k + 1 + used;
if (next < s.length) {
if (s.charAt(next) !== '\x04')
s = s.substring(next);
else
s = s.substring(next + 1);
} else {
s = '';
}
}
break;
}
case '\x04':
case ']':
// Extra bracket.
result += s.substring(0, next);
if (!caption_level && !break_at_pipe) {
result += (ch === ']' ? ']' : ']]');
s = s.substring(next + 1);
} else {
get_out = true;
}
break;
case '|':
result += s.substring(0, next);
if (break_at_pipe && !endings) {
// Pipe character at top level
get_out = true;
} else {
if (!caption_level && !break_at_pipe && !endings)
result += '|'; // Top-level pipe character
else
result += '|';
s = s.substring(next + 1);
}
break;
} // end switch
} // end while
if (in_nowiki)
result += ''; // Make sure this nowiki is closed.
// Close open templates and tables
while (endings) {
ch = pop_end();
result += (ch === '\x06' ? '\n' : '') + ch;
}
if (caption_level > 0) {
var used_up = initial_length - (get_out ? (s.length - next) : 0);
if (consumed.length < caption_level)
consumed.push(used_up);
else
consumed[caption_level - 1] = used_up;
}
return result;
}
// Replace multi-character tokens by one-character placeholders, simplifying the
// subsequent processing.
var s = input.replace(/\{\{/g, '\x01')
.replace(/\n\s*\|\}\}\}/g, '\n\x06\x02') // Table end + template end
.replace(/\}\}/g, '\x02')
.replace(/\[\[/g, '\x03')
.replace(/\]\]/g, '\x04')
.replace(/\n\s*\{\|/g, '\n\x05') // Table start and end must be on own line
.replace(/^\s*\{\|/, '\x05') // Table start at the very beginning
.replace(/\n\s*\|\}/g, '\n\x06') // (we strip leading whitespace)
.replace(/<\s*gallery\s*>/g, '\x07')
.replace(/<\/\s*gallery\s*>/g, '\x08');
s = sanitize(s, true, 0, true, false, true, true);
// with links, allow thumbs, don't break at pipe, allow tables, allow galleries
return s.replace(/\x01/g, '{{')
.replace(/\x02/g, '}}')
.replace(/\x03/g, '[[')
.replace(/\x04/g, ']]')
.replace(/\x05/g, '{|')
.replace(/\x06/g, '|}')
.replace(/\x07/g, '')
.replace(/\x08/g, '');
}
};
}());
//