var assert = require('assert'),
    Stream = require('stream'),
    inherits = require('util').inherits;


/*
 * This filter consumes a stream of characters and emits one string per line.
 *
 * Recognized line terminators are "\r\n", "\n\r", "\n" and "\r". The
 * terminators themselves are not part of the emitted strings. Input that does
 * not end with a terminator is emitted as a final line on end().
 */
function LineSplitter() {
  var self = this,
      buffer = '';

  Stream.call(this);
  this.writable = true;

  // Emits every complete line found in `text` and stores the unconsumed
  // remainder in `buffer`. Unless `is_final` is set, a one-character
  // terminator at the very end of `text` is not consumed yet: it may be the
  // first half of a two-character terminator whose second half arrives with
  // the next chunk, and consuming it now would produce a spurious empty line.
  function consume(text, is_final) {
    var terminator = /\r\n|\n\r|\n|\r/g,
        line_start = 0,
        match;

    while ((match = terminator.exec(text)) !== null) {
      if (!is_final &&
          match[0].length === 1 &&
          terminator.lastIndex === text.length) {
        break;
      }
      self.emit('data', text.slice(line_start, match.index));
      line_start = terminator.lastIndex;
    }

    buffer = text.slice(line_start);
  }

  // Accepts a chunk of text. Always returns true (no back-pressure).
  this.write = function(data) {
    consume(buffer + data, false);
    return true;
  };

  // Accepts an optional last chunk, flushes whatever is left as the final
  // line, then emits 'end'.
  this.end = function(data) {
    consume(buffer + (data || ''), true);
    if (buffer) {
      self.emit('data', buffer);
      buffer = '';
    }
    self.emit('end');
  };
}
inherits(LineSplitter, Stream);


/*
 * This filter consumes lines and emits paragraph objects.
 *
 * An emitted paragraph has these properties:
 *   li                - the list indicator ("1.", "*", "-") or '' if none
 *   in_license_block  - whether the paragraph sits between block separators
 *   level             - indentation level (leading whitespace length / 2)
 *   lines             - the text lines, stripped of indentation and comments
 */
function ParagraphParser() {
  var self = this,
      block_is_license_block = false,
      block_has_c_style_comment,
      paragraph_line_indent,
      paragraph;

  Stream.call(this);
  this.writable = true;

  resetBlock(false);

  // Accepts one line. Always returns true (no back-pressure).
  this.write = function(data) {
    parseLine(data + '');
    return true;
  };

  // Accepts an optional last line, flushes the pending paragraph and emits
  // 'end'.
  this.end = function(data) {
    if (data) {
      parseLine(data + '');
    }
    flushParagraph();
    self.emit('end');
  };

  // Starts a fresh, empty paragraph. An indent of -1 means "not known yet".
  function resetParagraph() {
    paragraph_line_indent = -1;

    paragraph = {
      li: '',
      in_license_block: block_is_license_block,
      lines: []
    };
  }

  // Starts a new block (license or non-license) with a fresh paragraph.
  function resetBlock(is_license_block) {
    block_is_license_block = is_license_block;
    block_has_c_style_comment = false;
    resetParagraph();
  }

  // Emits the pending paragraph if it has any content, then starts a new one.
  function flushParagraph() {
    if (paragraph.lines.length || paragraph.li) {
      self.emit('data', paragraph);
    }
    resetParagraph();
  }

  function parseLine(line) {
    // Strip trailing whitespace
    line = line.replace(/\s*$/, '');

    // Detect block separator. Each separator toggles between license and
    // non-license blocks.
    if (/^\s*(=|"){3,}\s*$/.test(line)) {
      flushParagraph();
      resetBlock(!block_is_license_block);
      return;
    }

    // Strip comments around block
    if (block_is_license_block) {
      if (!block_has_c_style_comment)
        block_has_c_style_comment = /^\s*(\/\*)/.test(line);
      if (block_has_c_style_comment) {
        var prev = line;
        line = line.replace(/^(\s*?)(?:\s?\*\/|\/\*\s|\s\*\s?)/, '$1');
        // No comment decoration on this line: drop two leading spaces instead.
        if (prev === line)
          line = line.replace(/^\s{2}/, '');
        // The comment closes on this line.
        if (/\*\//.test(prev))
          block_has_c_style_comment = false;
      } else {
        // Strip C++ and perl style comments.
        line = line.replace(/^(\s*)(?:\/\/\s?|#\s?)/, '$1');
      }
    }

    // Detect blank line (paragraph separator)
    if (!/\S/.test(line)) {
      flushParagraph();
      return;
    }

    // Detect separator "lines" within a block. These mark a paragraph break
    // and are stripped from the output.
    if (/^\s*[=*\-]{5,}\s*$/.test(line)) {
      flushParagraph();
      return;
    }

    // Find out indentation level and the start of a bulleted or numbered
    // list. Every part of the pattern is optional, so it always matches.
    var result = /^(\s*)(\d+\.|\*|-)?\s*/.exec(line);
    assert.ok(result);

    // The number of characters that will be stripped from the beginning of
    // the line.
    var line_strip_length = result[0].length;

    // The indentation size that will be used to detect indentation jumps.
    // Fudge by 1 space.
    var line_indent = Math.floor(result[0].length / 2) * 2;

    // The indentation level that will be exported
    var level = Math.floor(result[1].length / 2);

    // The list indicator that precedes the actual content, if any.
    var line_li = result[2];

    // Flush the paragraph when there is a li or an indentation jump
    if (line_li || (line_indent !== paragraph_line_indent &&
                    paragraph_line_indent !== -1)) {
      flushParagraph();
      paragraph.li = line_li || '';
    }

    // Set the paragraph indent that we use to detect indentation jumps. When
    // we just detected a list indicator, wait for the next line to arrive
    // before setting this. The indent is recorded while it is still unknown
    // (-1); after the flush check above it can only be -1 or already equal
    // to line_indent.
    if (!line_li && paragraph_line_indent === -1) {
      paragraph_line_indent = line_indent;
    }

    // Set the output indent level if it has not been set yet.
    if (paragraph.level === undefined)
      paragraph.level = level;

    // Strip leading whitespace and li.
    line = line.slice(line_strip_length);

    if (line)
      paragraph.lines.push(line);
  }
}
inherits(ParagraphParser, Stream);


/*
 * This filter consumes paragraph objects and emits modified paragraph objects.
 * The lines within the paragraph are unwrapped where appropriate.
 *
 * The `lines` property of each paragraph is replaced by a new array in which
 * consecutive lines have been joined with a single space, except where the
 * line break looks intentional.
 */
function Unwrapper() {
  var self = this;

  Stream.call(this);
  this.writable = true;

  // Decides whether the break between `line` and `next_line` should be kept.
  function isIntentionalBreak(line, next_line) {
    // Only when a line is really short, the line was probably kept separate
    // for a reason.
    if (line.length >= 50)
      return false;

    // If the first word on the next line really didn't fit after the line,
    // it probably was just ordinary wrapping after all.
    var next_first_word_length = next_line.replace(/\s.*$/, '').length;
    return line.length + next_first_word_length < 60;
  }

  // Accepts one paragraph. Always returns true (no back-pressure).
  this.write = function(paragraph) {
    var lines = paragraph.lines,
        unwrapped = [],
        current,
        i;

    // Break decisions are always taken on the original lines, never on an
    // already merged line, so a kept break cannot be lost by earlier merges.
    if (lines.length > 0) {
      current = lines[0];
      for (i = 0; i < lines.length - 1; i++) {
        if (isIntentionalBreak(lines[i], lines[i + 1])) {
          unwrapped.push(current);
          current = lines[i + 1];
        } else {
          current += ' ' + lines[i + 1];
        }
      }
      unwrapped.push(current);
    }

    paragraph.lines = unwrapped;
    self.emit('data', paragraph);
    return true;
  };

  // Accepts an optional last paragraph, then emits 'end'.
  this.end = function(data) {
    if (data)
      self.write(data);
    self.emit('end');
  };
}
inherits(Unwrapper, Stream);


/*
 * This filter generates an rtf document from a stream of paragraph objects.
 *
 * The document header is emitted right before the first paragraph and the
 * closing brace on end(). When no paragraph was written, nothing is emitted
 * except 'end'.
 */
function RtfGenerator() {
  var self = this,
      did_write_anything = false;

  Stream.call(this);
  this.writable = true;

  // Accepts one paragraph and emits its rtf representation. Always returns
  // true (no back-pressure).
  this.write = function(paragraph) {
    if (!did_write_anything) {
      emitHeader();
      did_write_anything = true;
    }

    // List items are indented one level deeper than their own indentation.
    var li = paragraph.li,
        level = paragraph.level + (li ? 1 : 0),
        lic = paragraph.in_license_block;

    var rtf = '\\pard';
    rtf += '\\sa150\\sl300\\slmult1';
    if (level > 0)
      rtf += '\\li' + (level * 240);
    if (li) {
      // Hanging indent: the list indicator sits 240 units left of the text.
      rtf += '\\tx' + (level * 240);
      rtf += '\\fi-240';
    }
    if (lic)
      rtf += '\\ri240';
    // Everything outside a license block is rendered bold.
    if (!lic)
      rtf += '\\b';
    if (li)
      rtf += ' ' + li + '\\tab';
    rtf += ' ';
    rtf += paragraph.lines.map(rtfEscape).join('\\line ');
    if (!lic)
      rtf += '\\b0';
    rtf += '\\par\n';

    self.emit('data', rtf);
    return true;
  };

  // Accepts an optional last paragraph, closes the document if one was
  // started, then emits 'end'.
  this.end = function(data) {
    if (data)
      self.write(data);
    if (did_write_anything)
      emitFooter();
    self.emit('end');
  };

  // Returns `number` as lowercase hex, left-padded with zeros to `length`.
  function toHex(number, length) {
    var hex = (~~number).toString(16);
    while (hex.length < length)
      hex = '0' + hex;
    return hex;
  }

  // Maps a UTF-16 code unit (0..65535) to the signed 16-bit value that the
  // rtf \u control word expects; values above 32767 become negative.
  function toSigned16(code_unit) {
    return code_unit > 0x7fff ? code_unit - 0x10000 : code_unit;
  }

  // Escapes a plain text string for inclusion in rtf.
  function rtfEscape(string) {
    return string
      // Backslash and braces are rtf syntax and need a backslash prefix.
      .replace(/[\\\{\}]/g, function(m) {
        return '\\' + m;
      })
      .replace(/\t/g, function() {
        return '\\tab ';
      })
      // Control characters and 0x7f-0xff are written as \'hh byte escapes.
      .replace(/[\x00-\x1f\x7f-\xff]/g, function(m) {
        return '\\\'' + toHex(m.charCodeAt(0), 2);
      })
      // Drop byte order marks.
      .replace(/\ufeff/g, '')
      // Everything else is written as \uN, where N is a signed 16-bit
      // *decimal* number, followed by '?' as the one-character fallback
      // announced by \uc1 in the header.
      .replace(/[\u0100-\uffff]/g, function(m) {
        return '\\u' + toSigned16(m.charCodeAt(0)) + '?';
      });
  }

  function emitHeader() {
    self.emit('data', '{\\rtf1\\ansi\\ansicpg1252\\uc1\\deff0\\deflang1033' +
                      '{\\fonttbl{\\f0\\fswiss\\fcharset0 Tahoma;}}\\fs20\n' +
                      '{\\*\\generator txt2rtf 0.0.1;}\n');
  }

  function emitFooter() {
    self.emit('data', '}');
  }
}
inherits(RtfGenerator, Stream);


// Wire the filters together: stdin -> lines -> paragraphs -> unwrapped
// paragraphs -> rtf -> stdout.
var stdin = process.stdin,
    stdout = process.stdout,
    line_splitter = new LineSplitter(),
    paragraph_parser = new ParagraphParser(),
    unwrapper = new Unwrapper(),
    rtf_generator = new RtfGenerator();

stdin.setEncoding('utf-8');
stdin.resume();

stdin.pipe(line_splitter);
line_splitter.pipe(paragraph_parser);
paragraph_parser.pipe(unwrapper);
unwrapper.pipe(rtf_generator);
rtf_generator.pipe(stdout);