205 lines
7.5 KiB
Go
205 lines
7.5 KiB
Go
package uniseg
|
||
|
||
import "unicode/utf8"
|
||
|
||
// The states of the sentence break parser. Each state records which part of
// a sentence-terminating sequence has been seen so far; the transitions
// between them are listed in sbTransitions.
const (
	sbAny       = iota // Default state; no rule-relevant context.
	sbCR               // A CR was seen (SB3: it must stay attached to a following LF).
	sbParaSep          // A paragraph separator (Sep, LF, or CR LF) was seen.
	sbATerm            // An ATerm was seen.
	sbUpper            // An Upper was seen.
	sbLower            // A Lower was seen.
	sbSB7              // An ATerm following an Upper or Lower was seen (left side of SB7).
	sbSB8Close         // ATerm Close+ was seen.
	sbSB8Sp            // ATerm Close* Sp+ was seen.
	sbSTerm            // An STerm was seen.
	sbSB8aClose        // STerm Close+ was seen.
	sbSB8aSp           // STerm Close* Sp+ was seen.
)
|
||
|
||
// The sentence break parser's breaking instructions. They are stored as the
// second element of each sbTransitions value.
const (
	sbDontBreak = iota // No sentence boundary before the next code point.
	sbBreak            // Sentence boundary before the next code point.
)
|
||
|
||
// The sentence break parser's state transitions. It's anologous to
|
||
// grTransitions, see comments there for details. Unicode version 14.0.0.
|
||
var sbTransitions = map[[2]int][3]int{
|
||
// SB3.
|
||
{sbAny, prCR}: {sbCR, sbDontBreak, 9990},
|
||
{sbCR, prLF}: {sbParaSep, sbDontBreak, 30},
|
||
|
||
// SB4.
|
||
{sbAny, prSep}: {sbParaSep, sbDontBreak, 9990},
|
||
{sbAny, prLF}: {sbParaSep, sbDontBreak, 9990},
|
||
{sbParaSep, prAny}: {sbAny, sbBreak, 40},
|
||
{sbCR, prAny}: {sbAny, sbBreak, 40},
|
||
|
||
// SB6.
|
||
{sbAny, prATerm}: {sbATerm, sbDontBreak, 9990},
|
||
{sbATerm, prNumeric}: {sbAny, sbDontBreak, 60},
|
||
{sbSB7, prNumeric}: {sbAny, sbDontBreak, 60}, // Because ATerm also appears in SB7.
|
||
|
||
// SB7.
|
||
{sbAny, prUpper}: {sbUpper, sbDontBreak, 9990},
|
||
{sbAny, prLower}: {sbLower, sbDontBreak, 9990},
|
||
{sbUpper, prATerm}: {sbSB7, sbDontBreak, 70},
|
||
{sbLower, prATerm}: {sbSB7, sbDontBreak, 70},
|
||
{sbSB7, prUpper}: {sbUpper, sbDontBreak, 70},
|
||
|
||
// SB8a.
|
||
{sbAny, prSTerm}: {sbSTerm, sbDontBreak, 9990},
|
||
{sbATerm, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbATerm, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbATerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
{sbSB7, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbSB7, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbSB7, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
{sbSB8Close, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbSB8Close, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbSB8Close, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
{sbSB8Sp, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbSB8Sp, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbSB8Sp, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
{sbSTerm, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbSTerm, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbSTerm, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
{sbSB8aClose, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbSB8aClose, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbSB8aClose, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
{sbSB8aSp, prSContinue}: {sbAny, sbDontBreak, 81},
|
||
{sbSB8aSp, prATerm}: {sbATerm, sbDontBreak, 81},
|
||
{sbSB8aSp, prSTerm}: {sbSTerm, sbDontBreak, 81},
|
||
|
||
// SB9.
|
||
{sbATerm, prClose}: {sbSB8Close, sbDontBreak, 90},
|
||
{sbSB7, prClose}: {sbSB8Close, sbDontBreak, 90},
|
||
{sbSB8Close, prClose}: {sbSB8Close, sbDontBreak, 90},
|
||
{sbATerm, prSp}: {sbSB8Sp, sbDontBreak, 90},
|
||
{sbSB7, prSp}: {sbSB8Sp, sbDontBreak, 90},
|
||
{sbSB8Close, prSp}: {sbSB8Sp, sbDontBreak, 90},
|
||
{sbSTerm, prClose}: {sbSB8aClose, sbDontBreak, 90},
|
||
{sbSB8aClose, prClose}: {sbSB8aClose, sbDontBreak, 90},
|
||
{sbSTerm, prSp}: {sbSB8aSp, sbDontBreak, 90},
|
||
{sbSB8aClose, prSp}: {sbSB8aSp, sbDontBreak, 90},
|
||
{sbATerm, prSep}: {sbParaSep, sbDontBreak, 90},
|
||
{sbATerm, prCR}: {sbParaSep, sbDontBreak, 90},
|
||
{sbATerm, prLF}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB7, prSep}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB7, prCR}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB7, prLF}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB8Close, prSep}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB8Close, prCR}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB8Close, prLF}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSTerm, prSep}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSTerm, prCR}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSTerm, prLF}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB8aClose, prSep}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB8aClose, prCR}: {sbParaSep, sbDontBreak, 90},
|
||
{sbSB8aClose, prLF}: {sbParaSep, sbDontBreak, 90},
|
||
|
||
// SB10.
|
||
{sbSB8Sp, prSp}: {sbSB8Sp, sbDontBreak, 100},
|
||
{sbSB8aSp, prSp}: {sbSB8aSp, sbDontBreak, 100},
|
||
{sbSB8Sp, prSep}: {sbParaSep, sbDontBreak, 100},
|
||
{sbSB8Sp, prCR}: {sbParaSep, sbDontBreak, 100},
|
||
{sbSB8Sp, prLF}: {sbParaSep, sbDontBreak, 100},
|
||
|
||
// SB11.
|
||
{sbATerm, prAny}: {sbAny, sbBreak, 110},
|
||
{sbSB7, prAny}: {sbAny, sbBreak, 110},
|
||
{sbSB8Close, prAny}: {sbAny, sbBreak, 110},
|
||
{sbSB8Sp, prAny}: {sbAny, sbBreak, 110},
|
||
{sbSTerm, prAny}: {sbAny, sbBreak, 110},
|
||
{sbSB8aClose, prAny}: {sbAny, sbBreak, 110},
|
||
{sbSB8aSp, prAny}: {sbAny, sbBreak, 110},
|
||
// We'll always break after ParaSep due to SB4.
|
||
}
|
||
|
||
// transitionSentenceBreakState determines the new state of the sentence break
|
||
// parser given the current state and the next code point. It also returns
|
||
// whether a sentence boundary was detected. If more than one code point is
|
||
// needed to determine the new state, the byte slice or the string starting
|
||
// after rune "r" can be used (whichever is not nil or empty) for further
|
||
// lookups.
|
||
func transitionSentenceBreakState(state int, r rune, b []byte, str string) (newState int, sentenceBreak bool) {
|
||
// Determine the property of the next character.
|
||
nextProperty := property(sentenceBreakCodePoints, r)
|
||
|
||
// SB5 (Replacing Ignore Rules).
|
||
if nextProperty == prExtend || nextProperty == prFormat {
|
||
if state == sbParaSep || state == sbCR {
|
||
return sbAny, true // Make sure we don't apply SB5 to SB3 or SB4.
|
||
}
|
||
if state < 0 {
|
||
return sbAny, true // SB1.
|
||
}
|
||
return state, false
|
||
}
|
||
|
||
// Find the applicable transition in the table.
|
||
var rule int
|
||
transition, ok := sbTransitions[[2]int{state, nextProperty}]
|
||
if ok {
|
||
// We have a specific transition. We'll use it.
|
||
newState, sentenceBreak, rule = transition[0], transition[1] == sbBreak, transition[2]
|
||
} else {
|
||
// No specific transition found. Try the less specific ones.
|
||
transAnyProp, okAnyProp := sbTransitions[[2]int{state, prAny}]
|
||
transAnyState, okAnyState := sbTransitions[[2]int{sbAny, nextProperty}]
|
||
if okAnyProp && okAnyState {
|
||
// Both apply. We'll use a mix (see comments for grTransitions).
|
||
newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
|
||
if transAnyProp[2] < transAnyState[2] {
|
||
sentenceBreak, rule = transAnyProp[1] == sbBreak, transAnyProp[2]
|
||
}
|
||
} else if okAnyProp {
|
||
// We only have a specific state.
|
||
newState, sentenceBreak, rule = transAnyProp[0], transAnyProp[1] == sbBreak, transAnyProp[2]
|
||
// This branch will probably never be reached because okAnyState will
|
||
// always be true given the current transition map. But we keep it here
|
||
// for future modifications to the transition map where this may not be
|
||
// true anymore.
|
||
} else if okAnyState {
|
||
// We only have a specific property.
|
||
newState, sentenceBreak, rule = transAnyState[0], transAnyState[1] == sbBreak, transAnyState[2]
|
||
} else {
|
||
// No known transition. SB999: Any × Any.
|
||
newState, sentenceBreak, rule = sbAny, false, 9990
|
||
}
|
||
}
|
||
|
||
// SB8.
|
||
if rule > 80 && (state == sbATerm || state == sbSB8Close || state == sbSB8Sp || state == sbSB7) {
|
||
// Check the right side of the rule.
|
||
var length int
|
||
for nextProperty != prOLetter &&
|
||
nextProperty != prUpper &&
|
||
nextProperty != prLower &&
|
||
nextProperty != prSep &&
|
||
nextProperty != prCR &&
|
||
nextProperty != prLF &&
|
||
nextProperty != prATerm &&
|
||
nextProperty != prSTerm {
|
||
// Move on to the next rune.
|
||
if b != nil { // Byte slice version.
|
||
r, length = utf8.DecodeRune(b)
|
||
b = b[length:]
|
||
} else { // String version.
|
||
r, length = utf8.DecodeRuneInString(str)
|
||
str = str[length:]
|
||
}
|
||
if r == utf8.RuneError {
|
||
break
|
||
}
|
||
nextProperty = property(sentenceBreakCodePoints, r)
|
||
}
|
||
if nextProperty == prLower {
|
||
return sbLower, false
|
||
}
|
||
}
|
||
|
||
return
|
||
}
|