From 40d215d8bf746a02977ebbc4138138e2e1800ea9 Mon Sep 17 00:00:00 2001 From: Andrew Dolgov Date: Mon, 14 Sep 2020 14:08:20 +0300 Subject: experimental: add optional soft hyphenation --- lib/hyphen/hyphen.js | 458 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 458 insertions(+) create mode 100644 lib/hyphen/hyphen.js (limited to 'lib/hyphen/hyphen.js') diff --git a/lib/hyphen/hyphen.js b/lib/hyphen/hyphen.js new file mode 100644 index 0000000..d5c2df3 --- /dev/null +++ b/lib/hyphen/hyphen.js @@ -0,0 +1,458 @@ +/** Text hyphenation in Javascript. + * Copyright (C) 2020 Yevhen Tiurin (yevhentiurin@gmail.com) + * https://github.com/ytiurin/hyphen + * + * Released under the ISC license + * https://github.com/ytiurin/hyphen/blob/master/LICENSE + */ +(function (root, factory) { + if (typeof define === "function" && define.amd) { + // AMD. Register as an anonymous module. + define([], factory); + } else if (typeof module === "object" && module.exports) { + // Node. Does not work with strict CommonJS, but + // only CommonJS-like environments that support module.exports, + // like Node. + module.exports = factory(); + } else { + // Browser globals (root is window) + root.createHyphenator = factory(); + } +})(this, function () { + var MIN_WORD_LENGTH_BOUNDRY = 5; + + var SETTING_DEFAULT_ASYNC = false, + SETTING_DEFAULT_DEBUG = false, + SETTING_DEFAULT_HTML = false, + SETTING_DEFAULT_HYPH_CHAR = "\u00AD", + SETTING_NAME_ASYNC = "async", + SETTING_NAME_DEBUG = "debug", + SETTING_NAME_HTML = "html", + SETTING_NAME_HYPH_CHAR = "hyphenChar", + SETTING_NAME_MIN_WORD_LENGTH = "minWordLength"; + + var _global = + typeof global === "object" + ? global + : typeof window === "object" + ? window + : typeof this === "object" + ? this + : {}; + + function cloneObj(source) { + var target = {}; + for (var key in source) { + target[key] = source[key]; + } + return target; + } + + function keyOrDefault(object, key, defaultValue) { + if (key in object) { + return object[key]; + } + return defaultValue; + } + + function exceptionsFromDefinition(patternsDefinition, hyphenChar) { + return patternsDefinition.exceptions.reduce(function ( + exceptions, + exception + ) { + exceptions[exception.replace(/\-/g, "")] = exception.replace( + /\-/g, + hyphenChar + ); + return exceptions; + }, + {}); + } + + function createHyphenator(patternsDefinition, options) { + options = options || {}; + var // + asyncMode = keyOrDefault( + options, + SETTING_NAME_ASYNC, + SETTING_DEFAULT_ASYNC + ), + caches = {}, + debug = keyOrDefault(options, SETTING_NAME_DEBUG, SETTING_DEFAULT_DEBUG), + exceptions = {}, + hyphenChar = keyOrDefault( + options, + SETTING_NAME_HYPH_CHAR, + SETTING_DEFAULT_HYPH_CHAR + ), + patterns = patternsDefinition.patterns.map(preprocessPattern), + minWordLength = Math.max( + options[SETTING_NAME_MIN_WORD_LENGTH] >> 0, + MIN_WORD_LENGTH_BOUNDRY + ), + skipHTML = keyOrDefault(options, SETTING_NAME_HTML, SETTING_DEFAULT_HTML); + + // Prepare cache + var cacheKey = hyphenChar + minWordLength; + exceptions[cacheKey] = exceptionsFromDefinition( + patternsDefinition, + hyphenChar + ); + caches[cacheKey] = cloneObj(exceptions[cacheKey]); + + if (asyncMode && !("Promise" in _global)) { + throw new Error( + "Failed to create hyphenator: Could not find global Promise object, needed for hyphenator to work in async mode" + ); + } + + return function (text, options) { + options = options || {}; + + var localDebug = keyOrDefault(options, SETTING_NAME_DEBUG, debug), + localHyphenChar = keyOrDefault( + options, + SETTING_NAME_HYPH_CHAR, + hyphenChar + ), + localMinWordLength = Math.max( + options[SETTING_NAME_MIN_WORD_LENGTH] >> 0, + minWordLength + ), + cacheKey = localHyphenChar + localMinWordLength; + + if (!exceptions[cacheKey]) { + exceptions[cacheKey] = exceptionsFromDefinition( + patternsDefinition, + localHyphenChar + ); + } + + if (!caches[cacheKey]) { + caches[cacheKey] = cloneObj(exceptions[cacheKey]); + } + + return start( + text, + patterns, + caches[cacheKey], + localDebug, + localHyphenChar, + skipHTML, + localMinWordLength, + asyncMode + ); + }; + } + function createTextChunkReader(text, hyphenChar, skipHTML, minWordLength) { + function readNextTextChunk() { + var nextTextChunk = ""; + + shouldHyphenate = void 0; + + chunkReader: while (nextCharIndex <= text.length) { + var // + nextChar = text.charAt(nextCharIndex++), + charIsLetter = + !!nextChar && !/\s|[\!-\@\[-\`\{-\~\u2013-\u203C]/.test(nextChar), + charIsAngleOpen = nextChar === "<", + charIsAngleClose = nextChar === ">", + charIsHyphen = nextChar === hyphenChar; + + do { + if (state === STATE_READ_TAG) { + if (charIsAngleClose) { + state = STATE_RETURN_UNTOUCHED; + } + break; + } + + if (charIsHyphen) { + shouldHyphenate = SHOULD_SKIP; + state = STATE_READ_WORD; + break; + } + + if (charIsLetter) { + state = STATE_READ_WORD; + break; + } + + if (state === STATE_READ_WORD) { + state = STATE_RETURN_WORD; + shouldHyphenate = + shouldHyphenate || + (nextTextChunk.length >= minWordLength && SHOULD_HYPHENATE); + break; + } + + shouldHyphenate = SHOULD_SKIP; + state = STATE_RETURN_UNTOUCHED; + } while (0); + + if ( + charIsAngleOpen && + state !== STATE_RETURN_WORD && + skipHTML && + !isSpacelike(text.charAt(nextCharIndex)) + ) { + shouldHyphenate = SHOULD_SKIP; + state = STATE_READ_TAG; + } + + switch (state) { + case STATE_READ_TAG: + nextTextChunk += nextChar; + break; + + case STATE_READ_WORD: + nextTextChunk += nextChar; + break; + + case STATE_RETURN_UNTOUCHED: + nextTextChunk += nextChar; + break chunkReader; + + case STATE_RETURN_WORD: + nextCharIndex--; + break chunkReader; + } + } + return nextTextChunk || void 0; + } + + function shouldNextHyphenate() { + return shouldHyphenate === SHOULD_HYPHENATE; + } + + var isSpacelike = RegExp.prototype.test.bind(/\s/); + + var // + nextCharIndex = 0, + SHOULD_HYPHENATE = 1, + SHOULD_SKIP = 2, + shouldHyphenate, + STATE_READ_TAG = 1, + STATE_READ_WORD = 2, + STATE_RETURN_UNTOUCHED = 3, + STATE_RETURN_WORD = 4, + state; + + return [readNextTextChunk, shouldNextHyphenate]; + } + function hyphenateWord(text, patterns, debug, hyphenChar) { + var // + levels = new Array(text.length + 1), + loweredText = text.toLocaleLowerCase(), + p = [], + patternData, + patternIndex = 0; + + for (var i = levels.length; i--; ) levels[i] = 0; + + while ((patternData = patterns[patternIndex++])) { + var // + fromChar = 0, + endPattern = false; + while (!endPattern) { + var // + patternEntityIndex = loweredText.indexOf(patternData.text, fromChar), + patternFits = + patternEntityIndex > -1 && + (patternData.stickToLeft ? patternEntityIndex === 0 : true) && + (patternData.stickToRight + ? patternEntityIndex + patternData.text.length === text.length + : true); + + if (patternFits) { + p.push(patternData.pattern + ">" + patternData.levels.join("")); + + for (var i = 0; i < patternData.levels.length; i++) + levels[patternEntityIndex + i] = Math.max( + patternData.levels[i], + levels[patternEntityIndex + i] + ); + } + if (patternEntityIndex > -1 && patternData.text.length > 0) { + fromChar = patternEntityIndex + patternData.text.length + 1; + } else { + endPattern = true; + } + } + } + + levels[0] = levels[1] = levels[levels.length - 1] = levels[ + levels.length - 2 + ] = 0; + + var // + hyphenatedText = "", + leveledText = "", + debugHyphenatedText = ""; + + for (var i = 0; i < levels.length; i++) { + hyphenatedText += + (levels[i] % 2 === 1 ? hyphenChar : "") + text.charAt(i); + debugHyphenatedText += (levels[i] % 2 === 1 ? "-" : "") + text.charAt(i); + leveledText += (levels[i] > 0 ? levels[i] : "") + text.charAt(i); + } + + if (debug) + console.log.apply( + console, + [text, "->"] + .concat(p) + .concat(["->"]) + .concat(levels) + .concat(["->", leveledText]) + .concat(["->", debugHyphenatedText]) + ); + + return hyphenatedText; + } + function preprocessPattern(pattern) { + var // + patternCharIndex = 0, + patternChar, + patternData = { + pattern: pattern, + text: "", + levels: [], + stickToLeft: 0, + stickToRight: 0 + }, + states = { alphabet: 1, level: 2, stickToLeft: 3, stickToRight: 4 }; + + while ((patternChar = pattern.charAt(patternCharIndex++))) { + var // + charIsDot = patternChar === ".", + charIsNumber = !charIsDot && /\d/.test(patternChar), + state = charIsDot + ? patternCharIndex - 1 === 0 + ? states.stickToLeft + : states.stickToRight + : charIsNumber + ? states.level + : states.alphabet; + + switch (state) { + case states.alphabet: + !prevCharIsNumber && patternData.levels.push(0); + patternData.text += patternChar; + break; + + case states.level: + patternData.levels.push(parseInt(patternChar)); + break; + + case states.stickToLeft: + patternData.stickToLeft = true; + break; + + case states.stickToRight: + patternData.stickToRight = true; + break; + } + + var prevCharIsNumber = charIsNumber; + } + + return patternData; + } + function start( + text, + patterns, + cache, + debug, + hyphenChar, + skipHTML, + minWordLength, + isAsync + ) { + function done() { + allTime = new Date() - allTime; + resolveNewText(newText); + + if (debug) { + console.log( + "----------------\nHyphenation stats: " + + processedN + + " text chunks processed, " + + hyphenatedN + + " words hyphenated" + ); + console.log("Work time: " + workTime / 1000); + console.log("Wait time: " + (allTime - workTime) / 1000); + console.log("All time: " + allTime / 1000); + } + } + + var // + newText = "", + nextTextChunk, + reader = createTextChunkReader(text, hyphenChar, skipHTML, minWordLength), + readNextTextChunk = reader[0], + shouldNextHyphenate = reader[1], + states = { hyphenateWord: 1, concatenate: 2 }, + processedN = 0, + hyphenatedN = 0; + + var // + allTime = new Date(), + workTime = 0; + + var resolveNewText = function () {}; + + function nextTick() { + var loopStart = new Date(); + + while ( + (!isAsync || new Date() - loopStart < 10) && + (nextTextChunk = readNextTextChunk()) + ) { + var state = shouldNextHyphenate() + ? states.hyphenateWord + : states.concatenate; + + switch (state) { + case states.hyphenateWord: + if (!cache[nextTextChunk]) + cache[nextTextChunk] = hyphenateWord( + nextTextChunk, + patterns, + debug, + hyphenChar + ); + + if (nextTextChunk !== cache[nextTextChunk]) hyphenatedN++; + + nextTextChunk = cache[nextTextChunk]; + + case states.concatenate: + newText += nextTextChunk; + } + + processedN++; + } + workTime += new Date() - loopStart; + + if (!nextTextChunk) { + done(); + } else { + setTimeout(nextTick); + } + } + + if (isAsync) { + setTimeout(nextTick); + return new Promise(function (resolve) { + resolveNewText = resolve; + }); + } else { + nextTick(); + return newText; + } + } + + return createHyphenator; +}); -- cgit v1.2.3