split.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
// Copyright 2015 go-swagger maintainers
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
  14. package swag
  15. import (
  16. "bytes"
  17. "sync"
  18. "unicode"
  19. "unicode/utf8"
  20. )
type (
	// splitter breaks a name into lexems, optionally recognizing
	// configured initialisms (e.g. "HTTP", "ID") as single words.
	splitter struct {
		initialisms              []string
		initialismsRunes         [][]rune
		initialismsUpperCased    [][]rune // initialisms cached in their trimmed, upper-cased version
		postSplitInitialismCheck bool     // when true, re-check split segments against initialisms
	}

	// splitterOption alters the default behavior of a splitter.
	splitterOption func(*splitter)

	// initialismMatch tracks the progress of matching one initialism
	// against a run of input runes.
	initialismMatch struct {
		body       []rune // runes of the initialism being matched
		start, end int    // rune positions of the match within the input
		complete   bool   // true once the whole initialism has been matched
	}
	initialismMatches []initialismMatch
)
type (
	// memory pools of temporary objects.
	//
	// These are used to recycle temporarily allocated objects
	// and relieve the GC from undue pressure.

	// matchesPool recycles initialismMatches slices.
	matchesPool struct {
		*sync.Pool
	}

	// buffersPool recycles *bytes.Buffer scratch buffers.
	buffersPool struct {
		*sync.Pool
	}

	// lexemsPool recycles []nameLexem result slices.
	lexemsPool struct {
		*sync.Pool
	}

	// splittersPool recycles splitter instances.
	splittersPool struct {
		*sync.Pool
	}
)
var (
	// poolOfMatches holds temporary slices for recycling during the initialism match process
	poolOfMatches = matchesPool{
		Pool: &sync.Pool{
			New: func() any {
				// pre-size to the expected maximum number of matches
				s := make(initialismMatches, 0, maxAllocMatches)
				return &s
			},
		},
	}

	// poolOfBuffers recycles the scratch buffers used while breaking down
	// casual (non-initialism) strings
	poolOfBuffers = buffersPool{
		Pool: &sync.Pool{
			New: func() any {
				return new(bytes.Buffer)
			},
		},
	}

	// poolOfLexems recycles the result slices handed out by the splitter;
	// callers redeem them via RedeemLexems
	poolOfLexems = lexemsPool{
		Pool: &sync.Pool{
			New: func() any {
				s := make([]nameLexem, 0, maxAllocMatches)
				return &s
			},
		},
	}

	// poolOfSplitters recycles splitter instances (options are reset on borrow)
	poolOfSplitters = splittersPool{
		Pool: &sync.Pool{
			New: func() any {
				s := newSplitter()
				return &s
			},
		},
	}
)
  88. // nameReplaceTable finds a word representation for special characters.
  89. func nameReplaceTable(r rune) (string, bool) {
  90. switch r {
  91. case '@':
  92. return "At ", true
  93. case '&':
  94. return "And ", true
  95. case '|':
  96. return "Pipe ", true
  97. case '$':
  98. return "Dollar ", true
  99. case '!':
  100. return "Bang ", true
  101. case '-':
  102. return "", true
  103. case '_':
  104. return "", true
  105. default:
  106. return "", false
  107. }
  108. }
  109. // split calls the splitter.
  110. //
  111. // Use newSplitter for more control and options
  112. func split(str string) []string {
  113. s := poolOfSplitters.BorrowSplitter()
  114. lexems := s.split(str)
  115. result := make([]string, 0, len(*lexems))
  116. for _, lexem := range *lexems {
  117. result = append(result, lexem.GetOriginal())
  118. }
  119. poolOfLexems.RedeemLexems(lexems)
  120. poolOfSplitters.RedeemSplitter(s)
  121. return result
  122. }
  123. func newSplitter(options ...splitterOption) splitter {
  124. s := splitter{
  125. postSplitInitialismCheck: false,
  126. initialisms: initialisms,
  127. initialismsRunes: initialismsRunes,
  128. initialismsUpperCased: initialismsUpperCased,
  129. }
  130. for _, option := range options {
  131. option(&s)
  132. }
  133. return s
  134. }
// withPostSplitInitialismCheck allows to catch initialisms after main split process.
//
// When enabled, each segment produced by the casual-string breakdown is
// re-checked (case-insensitively) against the known initialisms.
func withPostSplitInitialismCheck(s *splitter) {
	s.postSplitInitialismCheck = true
}
// BorrowMatches returns a recycled matches slice from the pool,
// emptied but retaining its allocated capacity.
func (p matchesPool) BorrowMatches() *initialismMatches {
	s := p.Get().(*initialismMatches)
	*s = (*s)[:0] // reset slice, keep allocated capacity

	return s
}
  144. func (p buffersPool) BorrowBuffer(size int) *bytes.Buffer {
  145. s := p.Get().(*bytes.Buffer)
  146. s.Reset()
  147. if s.Cap() < size {
  148. s.Grow(size)
  149. }
  150. return s
  151. }
// BorrowLexems returns a recycled lexems slice from the pool,
// emptied but retaining its allocated capacity.
func (p lexemsPool) BorrowLexems() *[]nameLexem {
	s := p.Get().(*[]nameLexem)
	*s = (*s)[:0] // reset slice, keep allocated capacity

	return s
}
  157. func (p splittersPool) BorrowSplitter(options ...splitterOption) *splitter {
  158. s := p.Get().(*splitter)
  159. s.postSplitInitialismCheck = false // reset options
  160. for _, apply := range options {
  161. apply(s)
  162. }
  163. return s
  164. }
// RedeemMatches returns a borrowed matches slice to the pool for reuse.
func (p matchesPool) RedeemMatches(s *initialismMatches) {
	p.Put(s)
}

// RedeemBuffer returns a borrowed buffer to the pool for reuse.
func (p buffersPool) RedeemBuffer(s *bytes.Buffer) {
	p.Put(s)
}

// RedeemLexems returns a borrowed lexems slice to the pool for reuse.
func (p lexemsPool) RedeemLexems(s *[]nameLexem) {
	p.Put(s)
}

// RedeemSplitter returns a borrowed splitter to the pool for reuse.
func (p splittersPool) RedeemSplitter(s *splitter) {
	p.Put(s)
}
// isZero reports whether the match is the zero value, i.e. no match
// has been recorded.
//
// NOTE(review): a genuine match of a single-rune initialism at input
// position 0 would also have start == 0 && end == 0 — presumably all
// configured initialisms are longer than one rune; verify against the
// initialisms table.
func (m initialismMatch) isZero() bool {
	return m.start == 0 && m.end == 0
}
  180. func (s splitter) split(name string) *[]nameLexem {
  181. nameRunes := []rune(name)
  182. matches := s.gatherInitialismMatches(nameRunes)
  183. if matches == nil {
  184. return poolOfLexems.BorrowLexems()
  185. }
  186. return s.mapMatchesToNameLexems(nameRunes, matches)
  187. }
// gatherInitialismMatches scans the input runes and collects candidate
// initialism matches, marking those that run to completion.
//
// It returns nil for empty input. The returned slice is borrowed from
// poolOfMatches: it is up to the caller to redeem it.
func (s splitter) gatherInitialismMatches(nameRunes []rune) *initialismMatches {
	var matches *initialismMatches

	for currentRunePosition, currentRune := range nameRunes {
		// recycle these allocations as we loop over runes
		// with such recycling, only 2 slices should be allocated per call
		// instead of o(n).
		newMatches := poolOfMatches.BorrowMatches()

		// check current initialism matches
		if matches != nil { // skip first iteration
			for _, match := range *matches {
				if keepCompleteMatch := match.complete; keepCompleteMatch {
					// carry completed matches forward unchanged
					*newMatches = append(*newMatches, match)
					continue
				}

				// drop failed match
				currentMatchRune := match.body[currentRunePosition-match.start]
				if currentMatchRune != currentRune {
					continue
				}

				// try to complete ongoing match
				if currentRunePosition-match.start == len(match.body)-1 {
					// we are close; the next step is to check the symbol ahead
					// if it is a small letter, then it is not the end of match
					// but beginning of the next word
					if currentRunePosition < len(nameRunes)-1 {
						nextRune := nameRunes[currentRunePosition+1]
						if newWord := unicode.IsLower(nextRune); newWord {
							// oh ok, it was the start of a new word
							continue
						}
					}

					match.complete = true
					match.end = currentRunePosition
				}

				*newMatches = append(*newMatches, match)
			}
		}

		// check for new initialism matches
		for i := range s.initialisms {
			initialismRunes := s.initialismsRunes[i]
			if initialismRunes[0] == currentRune {
				// start tracking a new candidate match at this position
				*newMatches = append(*newMatches, initialismMatch{
					start:    currentRunePosition,
					body:     initialismRunes,
					complete: false,
				})
			}
		}

		if matches != nil {
			poolOfMatches.RedeemMatches(matches)
		}
		matches = newMatches
	}

	// up to the caller to redeem this last slice
	return matches
}
// mapMatchesToNameLexems converts the gathered matches into a sequence of
// name lexems, interleaving the non-initialism ("casual") parts of the
// name with the recognized initialisms.
//
// The matches slice is redeemed here; the returned lexems slice is
// borrowed from poolOfLexems and must be redeemed by the caller.
func (s splitter) mapMatchesToNameLexems(nameRunes []rune, matches *initialismMatches) *[]nameLexem {
	nameLexems := poolOfLexems.BorrowLexems()

	var lastAcceptedMatch initialismMatch
	for _, match := range *matches {
		if !match.complete {
			continue
		}

		if firstMatch := lastAcceptedMatch.isZero(); firstMatch {
			// everything before the first accepted match is a casual string
			s.appendBrokenDownCasualString(nameLexems, nameRunes[:match.start])
			*nameLexems = append(*nameLexems, s.breakInitialism(string(match.body)))

			lastAcceptedMatch = match

			continue
		}

		if overlappedMatch := match.start <= lastAcceptedMatch.end; overlappedMatch {
			// a match overlapping the previously accepted one is dropped
			continue
		}

		// casual string between the previous accepted match and this one
		middle := nameRunes[lastAcceptedMatch.end+1 : match.start]
		s.appendBrokenDownCasualString(nameLexems, middle)
		*nameLexems = append(*nameLexems, s.breakInitialism(string(match.body)))

		lastAcceptedMatch = match
	}

	// we have not found any accepted matches
	if lastAcceptedMatch.isZero() {
		*nameLexems = (*nameLexems)[:0]
		s.appendBrokenDownCasualString(nameLexems, nameRunes)
	} else if lastAcceptedMatch.end+1 != len(nameRunes) {
		// break down the tail of the name after the last accepted match
		rest := nameRunes[lastAcceptedMatch.end+1:]
		s.appendBrokenDownCasualString(nameLexems, rest)
	}

	poolOfMatches.RedeemMatches(matches)

	return nameLexems
}
// breakInitialism wraps a matched initialism as a single initialism lexem,
// using the matched text both as the original and as the match.
func (s splitter) breakInitialism(original string) nameLexem {
	return newInitialismNameLexem(original, original)
}
// appendBrokenDownCasualString breaks a non-initialism part of a name into
// lexems and appends them to segments.
//
// Segments are cut on special characters (see nameReplaceTable), on runes
// that are not letters/marks/numbers/connector punctuation, and on
// upper-case letters. When the splitter was built
// withPostSplitInitialismCheck, each produced segment is additionally
// checked against the known initialisms.
func (s splitter) appendBrokenDownCasualString(segments *[]nameLexem, str []rune) {
	currentSegment := poolOfBuffers.BorrowBuffer(len(str)) // unlike strings.Builder, bytes.Buffer initial storage can be reused
	defer func() {
		poolOfBuffers.RedeemBuffer(currentSegment)
	}()

	// addCasualNameLexem appends a plain (non-initialism) lexem.
	addCasualNameLexem := func(original string) {
		*segments = append(*segments, newCasualNameLexem(original))
	}

	// addInitialismNameLexem appends a lexem recognized as an initialism.
	addInitialismNameLexem := func(original, match string) {
		*segments = append(*segments, newInitialismNameLexem(original, match))
	}

	var addNameLexem func(string)
	if s.postSplitInitialismCheck {
		// promote a segment to an initialism lexem when it matches a
		// known initialism (ignoring case and surrounding blanks)
		addNameLexem = func(original string) {
			for i := range s.initialisms {
				if isEqualFoldIgnoreSpace(s.initialismsUpperCased[i], original) {
					addInitialismNameLexem(original, s.initialisms[i])

					return
				}
			}

			addCasualNameLexem(original)
		}
	} else {
		addNameLexem = addCasualNameLexem
	}

	for _, rn := range str {
		if replace, found := nameReplaceTable(rn); found {
			// special character: flush the current segment, then emit the
			// replacement word (if any)
			if currentSegment.Len() > 0 {
				addNameLexem(currentSegment.String())
				currentSegment.Reset()
			}

			if replace != "" {
				addNameLexem(replace)
			}

			continue
		}

		if !unicode.In(rn, unicode.L, unicode.M, unicode.N, unicode.Pc) {
			// not a letter, mark, number or connector punctuation:
			// acts as a plain separator and is dropped
			if currentSegment.Len() > 0 {
				addNameLexem(currentSegment.String())
				currentSegment.Reset()
			}

			continue
		}

		if unicode.IsUpper(rn) {
			// an upper-case rune starts a new segment
			if currentSegment.Len() > 0 {
				addNameLexem(currentSegment.String())
			}
			currentSegment.Reset()
		}

		currentSegment.WriteRune(rn)
	}

	// flush the trailing segment, if any
	if currentSegment.Len() > 0 {
		addNameLexem(currentSegment.String())
	}
}
// isEqualFoldIgnoreSpace is the same as strings.EqualFold, but
// it ignores leading and trailing blank spaces in the compared
// string.
//
// base is assumed to be composed of upper-cased runes, and be already
// trimmed.
//
// This code is heavily inspired from strings.EqualFold.
//
// NOTE(review): in the ASCII fast paths, only ' ' and '\t' are treated as
// blanks; other ASCII whitespace (e.g. '\n') terminates the scan instead —
// presumably intentional, verify if other blanks may occur in str.
func isEqualFoldIgnoreSpace(base []rune, str string) bool {
	var i, baseIndex int
	// equivalent to b := []byte(str), but without data copy
	b := hackStringBytes(str)

	// skip leading blanks in str
	for i < len(b) {
		if c := b[i]; c < utf8.RuneSelf {
			// fast path for ASCII
			if c != ' ' && c != '\t' {
				break
			}
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRune(b[i:])
		if !unicode.IsSpace(r) {
			break
		}
		i += size
	}

	if i >= len(b) {
		// str is all blanks: it only matches an empty base
		return len(base) == 0
	}

	// compare base (already upper-cased) against str, folding str to upper
	for _, baseRune := range base {
		if i >= len(b) {
			break
		}

		if c := b[i]; c < utf8.RuneSelf {
			// single byte rune case (ASCII)
			if baseRune >= utf8.RuneSelf {
				return false
			}

			baseChar := byte(baseRune)
			if c != baseChar &&
				!('a' <= c && c <= 'z' && c-'a'+'A' == baseChar) {
				return false
			}

			baseIndex++
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRune(b[i:])
		if unicode.ToUpper(r) != baseRune {
			return false
		}

		baseIndex++
		i += size
	}

	if baseIndex != len(base) {
		// str ended before all of base was consumed
		return false
	}

	// all passed: now we should only have blanks
	for i < len(b) {
		if c := b[i]; c < utf8.RuneSelf {
			// fast path for ASCII
			if c != ' ' && c != '\t' {
				return false
			}
			i++

			continue
		}

		// unicode case
		r, size := utf8.DecodeRune(b[i:])
		if !unicode.IsSpace(r) {
			return false
		}

		i += size
	}

	return true
}