lex.go 29 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225
  1. package toml
  2. import (
  3. "fmt"
  4. "reflect"
  5. "runtime"
  6. "strings"
  7. "unicode"
  8. "unicode/utf8"
  9. )
// itemType identifies the kind of an item produced by the lexer.
type itemType int

// The item kinds. Values are assigned with iota, so the declaration order
// fixes each constant's numeric value.
const (
	itemError              itemType = iota // lexing failed; emitted by errorf with the message as the value
	itemNIL                                // used in the parser to indicate no type
	itemEOF                                // end of input; emitted by lexTop and lexTopEnd
	itemText                               // a bare name; emitted by lexBareName
	itemString                             // contents of a "..." string
	itemRawString                          // contents of a '...' string
	itemMultilineString                    // contents of a """...""" string
	itemRawMultilineString                 // contents of a '''...''' string
	itemBool
	itemInteger
	itemFloat
	itemDatetime
	itemArray // the start of an array
	itemArrayEnd
	itemTableStart
	itemTableEnd
	itemArrayTableStart
	itemArrayTableEnd
	itemKeyStart
	itemKeyEnd
	itemCommentStart
	itemInlineTableStart
	itemInlineTableEnd
)
// Characters with special meaning to the lexer. Several names share the same
// character: '[' and ']' delimit tables, arrays of tables, and arrays.
const (
	eof              = 0 // returned by next once the input is exhausted
	comma            = ','
	tableStart       = '['
	tableEnd         = ']'
	arrayTableStart  = '['
	arrayTableEnd    = ']'
	tableSep         = '.'
	keySep           = '='
	arrayStart       = '['
	arrayEnd         = ']'
	commentStart     = '#'
	stringStart      = '"'
	stringEnd        = '"'
	rawStringStart   = '\''
	rawStringEnd     = '\''
	inlineTableStart = '{'
	inlineTableEnd   = '}'
)
// stateFn is one state of the lexer's state machine. It is called with the
// lexer and returns the state to run next; nextItem stores that result in
// lx.state. errorf returns a nil stateFn.
type stateFn func(lx *lexer) stateFn
// lexer holds the scanning state for one input string.
type lexer struct {
	input string      // the text being lexed
	start int         // byte offset where the pending item begins; current() is input[start:pos]
	pos   int         // byte offset of the next rune to read
	line  int         // current line number; lex initializes it to 1 and next increments it on '\n'
	state stateFn     // the state function nextItem runs next
	items chan item   // emitted items; lex creates it with a buffer of 10

	// Allow for backing up by up to four runes.
	// This is necessary because TOML contains 3-rune tokens (""" and ''').
	prevWidths [4]int // byte widths of the most recently read runes, newest first
	nprev      int    // how many of prevWidths are in use

	// If we emit an eof, we can still back up, but it is not OK to call
	// next again.
	atEOF bool

	// A stack of state functions used to maintain context.
	// The idea is to reuse parts of the state machine in various places.
	// For example, values can appear at the top level or within arbitrarily
	// nested arrays. The last state on the stack is used after a value has
	// been lexed. Similarly for comments.
	stack []stateFn
}
// item is a single token produced by the lexer.
type item struct {
	typ  itemType // the kind of token
	val  string   // the token text, or the error message for itemError
	line int      // the lexer's line counter at the time the item was emitted
}
  82. func (lx *lexer) nextItem() item {
  83. for {
  84. select {
  85. case item := <-lx.items:
  86. return item
  87. default:
  88. lx.state = lx.state(lx)
  89. //fmt.Printf(" STATE %-24s current: %-10q stack: %s\n", lx.state, lx.current(), lx.stack)
  90. }
  91. }
  92. }
  93. func lex(input string) *lexer {
  94. lx := &lexer{
  95. input: input,
  96. state: lexTop,
  97. line: 1,
  98. items: make(chan item, 10),
  99. stack: make([]stateFn, 0, 10),
  100. }
  101. return lx
  102. }
  103. func (lx *lexer) push(state stateFn) {
  104. lx.stack = append(lx.stack, state)
  105. }
  106. func (lx *lexer) pop() stateFn {
  107. if len(lx.stack) == 0 {
  108. return lx.errorf("BUG in lexer: no states to pop")
  109. }
  110. last := lx.stack[len(lx.stack)-1]
  111. lx.stack = lx.stack[0 : len(lx.stack)-1]
  112. return last
  113. }
  114. func (lx *lexer) current() string {
  115. return lx.input[lx.start:lx.pos]
  116. }
  117. func (lx *lexer) emit(typ itemType) {
  118. lx.items <- item{typ, lx.current(), lx.line}
  119. lx.start = lx.pos
  120. }
  121. func (lx *lexer) emitTrim(typ itemType) {
  122. lx.items <- item{typ, strings.TrimSpace(lx.current()), lx.line}
  123. lx.start = lx.pos
  124. }
  125. func (lx *lexer) next() (r rune) {
  126. if lx.atEOF {
  127. panic("BUG in lexer: next called after EOF")
  128. }
  129. if lx.pos >= len(lx.input) {
  130. lx.atEOF = true
  131. return eof
  132. }
  133. if lx.input[lx.pos] == '\n' {
  134. lx.line++
  135. }
  136. lx.prevWidths[3] = lx.prevWidths[2]
  137. lx.prevWidths[2] = lx.prevWidths[1]
  138. lx.prevWidths[1] = lx.prevWidths[0]
  139. if lx.nprev < 4 {
  140. lx.nprev++
  141. }
  142. r, w := utf8.DecodeRuneInString(lx.input[lx.pos:])
  143. if r == utf8.RuneError {
  144. lx.errorf("invalid UTF-8 byte at position %d (line %d): 0x%02x", lx.pos, lx.line, lx.input[lx.pos])
  145. return utf8.RuneError
  146. }
  147. lx.prevWidths[0] = w
  148. lx.pos += w
  149. return r
  150. }
  151. // ignore skips over the pending input before this point.
  152. func (lx *lexer) ignore() {
  153. lx.start = lx.pos
  154. }
  155. // backup steps back one rune. Can be called 4 times between calls to next.
  156. func (lx *lexer) backup() {
  157. if lx.atEOF {
  158. lx.atEOF = false
  159. return
  160. }
  161. if lx.nprev < 1 {
  162. panic("BUG in lexer: backed up too far")
  163. }
  164. w := lx.prevWidths[0]
  165. lx.prevWidths[0] = lx.prevWidths[1]
  166. lx.prevWidths[1] = lx.prevWidths[2]
  167. lx.prevWidths[2] = lx.prevWidths[3]
  168. lx.nprev--
  169. lx.pos -= w
  170. if lx.pos < len(lx.input) && lx.input[lx.pos] == '\n' {
  171. lx.line--
  172. }
  173. }
  174. // accept consumes the next rune if it's equal to `valid`.
  175. func (lx *lexer) accept(valid rune) bool {
  176. if lx.next() == valid {
  177. return true
  178. }
  179. lx.backup()
  180. return false
  181. }
  182. // peek returns but does not consume the next rune in the input.
  183. func (lx *lexer) peek() rune {
  184. r := lx.next()
  185. lx.backup()
  186. return r
  187. }
  188. // skip ignores all input that matches the given predicate.
  189. func (lx *lexer) skip(pred func(rune) bool) {
  190. for {
  191. r := lx.next()
  192. if pred(r) {
  193. continue
  194. }
  195. lx.backup()
  196. lx.ignore()
  197. return
  198. }
  199. }
  200. // errorf stops all lexing by emitting an error and returning `nil`.
  201. // Note that any value that is a character is escaped if it's a special
  202. // character (newlines, tabs, etc.).
  203. func (lx *lexer) errorf(format string, values ...interface{}) stateFn {
  204. lx.items <- item{
  205. itemError,
  206. fmt.Sprintf(format, values...),
  207. lx.line,
  208. }
  209. return nil
  210. }
  211. // lexTop consumes elements at the top level of TOML data.
  212. func lexTop(lx *lexer) stateFn {
  213. r := lx.next()
  214. if isWhitespace(r) || isNL(r) {
  215. return lexSkip(lx, lexTop)
  216. }
  217. switch r {
  218. case commentStart:
  219. lx.push(lexTop)
  220. return lexCommentStart
  221. case tableStart:
  222. return lexTableStart
  223. case eof:
  224. if lx.pos > lx.start {
  225. return lx.errorf("unexpected EOF")
  226. }
  227. lx.emit(itemEOF)
  228. return nil
  229. }
  230. // At this point, the only valid item can be a key, so we back up
  231. // and let the key lexer do the rest.
  232. lx.backup()
  233. lx.push(lexTopEnd)
  234. return lexKeyStart
  235. }
  236. // lexTopEnd is entered whenever a top-level item has been consumed. (A value
  237. // or a table.) It must see only whitespace, and will turn back to lexTop
  238. // upon a newline. If it sees EOF, it will quit the lexer successfully.
  239. func lexTopEnd(lx *lexer) stateFn {
  240. r := lx.next()
  241. switch {
  242. case r == commentStart:
  243. // a comment will read to a newline for us.
  244. lx.push(lexTop)
  245. return lexCommentStart
  246. case isWhitespace(r):
  247. return lexTopEnd
  248. case isNL(r):
  249. lx.ignore()
  250. return lexTop
  251. case r == eof:
  252. lx.emit(itemEOF)
  253. return nil
  254. }
  255. return lx.errorf(
  256. "expected a top-level item to end with a newline, comment, or EOF, but got %q instead",
  257. r)
  258. }
  259. // lexTable lexes the beginning of a table. Namely, it makes sure that
  260. // it starts with a character other than '.' and ']'.
  261. // It assumes that '[' has already been consumed.
  262. // It also handles the case that this is an item in an array of tables.
  263. // e.g., '[[name]]'.
  264. func lexTableStart(lx *lexer) stateFn {
  265. if lx.peek() == arrayTableStart {
  266. lx.next()
  267. lx.emit(itemArrayTableStart)
  268. lx.push(lexArrayTableEnd)
  269. } else {
  270. lx.emit(itemTableStart)
  271. lx.push(lexTableEnd)
  272. }
  273. return lexTableNameStart
  274. }
  275. func lexTableEnd(lx *lexer) stateFn {
  276. lx.emit(itemTableEnd)
  277. return lexTopEnd
  278. }
  279. func lexArrayTableEnd(lx *lexer) stateFn {
  280. if r := lx.next(); r != arrayTableEnd {
  281. return lx.errorf(
  282. "expected end of table array name delimiter %q, but got %q instead",
  283. arrayTableEnd, r)
  284. }
  285. lx.emit(itemArrayTableEnd)
  286. return lexTopEnd
  287. }
  288. func lexTableNameStart(lx *lexer) stateFn {
  289. lx.skip(isWhitespace)
  290. switch r := lx.peek(); {
  291. case r == tableEnd || r == eof:
  292. return lx.errorf("unexpected end of table name (table names cannot be empty)")
  293. case r == tableSep:
  294. return lx.errorf("unexpected table separator (table names cannot be empty)")
  295. case r == stringStart || r == rawStringStart:
  296. lx.ignore()
  297. lx.push(lexTableNameEnd)
  298. return lexQuotedName
  299. default:
  300. lx.push(lexTableNameEnd)
  301. return lexBareName
  302. }
  303. }
  304. // lexTableNameEnd reads the end of a piece of a table name, optionally
  305. // consuming whitespace.
  306. func lexTableNameEnd(lx *lexer) stateFn {
  307. lx.skip(isWhitespace)
  308. switch r := lx.next(); {
  309. case isWhitespace(r):
  310. return lexTableNameEnd
  311. case r == tableSep:
  312. lx.ignore()
  313. return lexTableNameStart
  314. case r == tableEnd:
  315. return lx.pop()
  316. default:
  317. return lx.errorf("expected '.' or ']' to end table name, but got %q instead", r)
  318. }
  319. }
  320. // lexBareName lexes one part of a key or table.
  321. //
  322. // It assumes that at least one valid character for the table has already been
  323. // read.
  324. //
  325. // Lexes only one part, e.g. only 'a' inside 'a.b'.
  326. func lexBareName(lx *lexer) stateFn {
  327. r := lx.next()
  328. if isBareKeyChar(r) {
  329. return lexBareName
  330. }
  331. lx.backup()
  332. lx.emit(itemText)
  333. return lx.pop()
  334. }
  335. // lexBareName lexes one part of a key or table.
  336. //
  337. // It assumes that at least one valid character for the table has already been
  338. // read.
  339. //
  340. // Lexes only one part, e.g. only '"a"' inside '"a".b'.
  341. func lexQuotedName(lx *lexer) stateFn {
  342. r := lx.next()
  343. switch {
  344. case isWhitespace(r):
  345. return lexSkip(lx, lexValue)
  346. case r == stringStart:
  347. lx.ignore() // ignore the '"'
  348. return lexString
  349. case r == rawStringStart:
  350. lx.ignore() // ignore the "'"
  351. return lexRawString
  352. case r == eof:
  353. return lx.errorf("unexpected EOF; expected value")
  354. default:
  355. return lx.errorf("expected value but found %q instead", r)
  356. }
  357. }
  358. // lexKeyStart consumes all key parts until a '='.
  359. func lexKeyStart(lx *lexer) stateFn {
  360. lx.skip(isWhitespace)
  361. switch r := lx.peek(); {
  362. case r == '=' || r == eof:
  363. return lx.errorf("unexpected '=': key name appears blank")
  364. case r == '.':
  365. return lx.errorf("unexpected '.': keys cannot start with a '.'")
  366. case r == stringStart || r == rawStringStart:
  367. lx.ignore()
  368. fallthrough
  369. default: // Bare key
  370. lx.emit(itemKeyStart)
  371. return lexKeyNameStart
  372. }
  373. }
  374. func lexKeyNameStart(lx *lexer) stateFn {
  375. lx.skip(isWhitespace)
  376. switch r := lx.peek(); {
  377. case r == '=' || r == eof:
  378. return lx.errorf("unexpected '='")
  379. case r == '.':
  380. return lx.errorf("unexpected '.'")
  381. case r == stringStart || r == rawStringStart:
  382. lx.ignore()
  383. lx.push(lexKeyEnd)
  384. return lexQuotedName
  385. default:
  386. lx.push(lexKeyEnd)
  387. return lexBareName
  388. }
  389. }
  390. // lexKeyEnd consumes the end of a key and trims whitespace (up to the key
  391. // separator).
  392. func lexKeyEnd(lx *lexer) stateFn {
  393. lx.skip(isWhitespace)
  394. switch r := lx.next(); {
  395. case isWhitespace(r):
  396. return lexSkip(lx, lexKeyEnd)
  397. case r == eof:
  398. return lx.errorf("unexpected EOF; expected key separator %q", keySep)
  399. case r == '.':
  400. lx.ignore()
  401. return lexKeyNameStart
  402. case r == '=':
  403. lx.emit(itemKeyEnd)
  404. return lexSkip(lx, lexValue)
  405. default:
  406. return lx.errorf("expected '.' or '=', but got %q instead", r)
  407. }
  408. }
// lexValue starts the consumption of a value anywhere a value is expected.
// lexValue will ignore whitespace.
// After a value is lexed, the last state on the stack is popped and returned.
//
// It dispatches on the first rune of the value: a digit, '[', '{', '"', "'",
// a sign, or a letter each select the state that lexes that kind of value.
func lexValue(lx *lexer) stateFn {
	// We allow whitespace to precede a value, but NOT newlines.
	// In array syntax, the array states are responsible for ignoring newlines.
	r := lx.next()
	switch {
	case isWhitespace(r):
		return lexSkip(lx, lexValue)
	case isDigit(r):
		lx.backup() // put the digit back; lexNumberOrDateStart reads it again
		return lexNumberOrDateStart
	}
	switch r {
	case arrayStart:
		lx.ignore()
		lx.emit(itemArray)
		return lexArrayValue
	case inlineTableStart:
		lx.ignore()
		lx.emit(itemInlineTableStart)
		return lexInlineTableValue
	case stringStart:
		// One quote has been read. Two more make it a multi-line string;
		// otherwise un-read the second quote (if any) so lexString sees it.
		if lx.accept(stringStart) {
			if lx.accept(stringStart) {
				lx.ignore() // Ignore """
				return lexMultilineString
			}
			lx.backup()
		}
		lx.ignore() // ignore the '"'
		return lexString
	case rawStringStart:
		// Same as above, for ''' versus '.
		if lx.accept(rawStringStart) {
			if lx.accept(rawStringStart) {
				lx.ignore() // Ignore '''
				return lexMultilineRawString
			}
			lx.backup()
		}
		lx.ignore() // ignore the "'"
		return lexRawString
	case '.': // special error case, be kind to users
		return lx.errorf("floats must start with a digit, not '.'")
	case 'i', 'n':
		// Try to complete "inf" or "nan". If neither matches, control leaves
		// the switch and continues with the letter check below.
		// NOTE(review): the condition does not look at whether r was 'i' or
		// 'n', so e.g. "nnf" is also emitted as itemFloat; presumably the
		// parser rejects it later — confirm.
		if (lx.accept('n') && lx.accept('f')) || (lx.accept('a') && lx.accept('n')) {
			lx.emit(itemFloat)
			return lx.pop()
		}
	case '-', '+':
		return lexDecimalNumberStart
	}
	if unicode.IsLetter(r) {
		// Be permissive here; lexBool will give a nice error if the
		// user wrote something like
		//   x = foo
		// (i.e. not 'true' or 'false' but is something else word-like.)
		lx.backup()
		return lexBool
	}
	if r == eof {
		return lx.errorf("unexpected EOF; expected value")
	}
	return lx.errorf("expected value but found %q instead", r)
}
  475. // lexArrayValue consumes one value in an array. It assumes that '[' or ','
  476. // have already been consumed. All whitespace and newlines are ignored.
  477. func lexArrayValue(lx *lexer) stateFn {
  478. r := lx.next()
  479. switch {
  480. case isWhitespace(r) || isNL(r):
  481. return lexSkip(lx, lexArrayValue)
  482. case r == commentStart:
  483. lx.push(lexArrayValue)
  484. return lexCommentStart
  485. case r == comma:
  486. return lx.errorf("unexpected comma")
  487. case r == arrayEnd:
  488. // NOTE(caleb): The spec isn't clear about whether you can have
  489. // a trailing comma or not, so we'll allow it.
  490. return lexArrayEnd
  491. }
  492. lx.backup()
  493. lx.push(lexArrayValueEnd)
  494. return lexValue
  495. }
  496. // lexArrayValueEnd consumes everything between the end of an array value and
  497. // the next value (or the end of the array): it ignores whitespace and newlines
  498. // and expects either a ',' or a ']'.
  499. func lexArrayValueEnd(lx *lexer) stateFn {
  500. r := lx.next()
  501. switch {
  502. case isWhitespace(r) || isNL(r):
  503. return lexSkip(lx, lexArrayValueEnd)
  504. case r == commentStart:
  505. lx.push(lexArrayValueEnd)
  506. return lexCommentStart
  507. case r == comma:
  508. lx.ignore()
  509. return lexArrayValue // move on to the next value
  510. case r == arrayEnd:
  511. return lexArrayEnd
  512. }
  513. return lx.errorf(
  514. "expected a comma or array terminator %q, but got %s instead",
  515. arrayEnd, runeOrEOF(r))
  516. }
  517. // lexArrayEnd finishes the lexing of an array.
  518. // It assumes that a ']' has just been consumed.
  519. func lexArrayEnd(lx *lexer) stateFn {
  520. lx.ignore()
  521. lx.emit(itemArrayEnd)
  522. return lx.pop()
  523. }
  524. // lexInlineTableValue consumes one key/value pair in an inline table.
  525. // It assumes that '{' or ',' have already been consumed. Whitespace is ignored.
  526. func lexInlineTableValue(lx *lexer) stateFn {
  527. r := lx.next()
  528. switch {
  529. case isWhitespace(r):
  530. return lexSkip(lx, lexInlineTableValue)
  531. case isNL(r):
  532. return lx.errorf("newlines not allowed within inline tables")
  533. case r == commentStart:
  534. lx.push(lexInlineTableValue)
  535. return lexCommentStart
  536. case r == comma:
  537. return lx.errorf("unexpected comma")
  538. case r == inlineTableEnd:
  539. return lexInlineTableEnd
  540. }
  541. lx.backup()
  542. lx.push(lexInlineTableValueEnd)
  543. return lexKeyStart
  544. }
  545. // lexInlineTableValueEnd consumes everything between the end of an inline table
  546. // key/value pair and the next pair (or the end of the table):
  547. // it ignores whitespace and expects either a ',' or a '}'.
  548. func lexInlineTableValueEnd(lx *lexer) stateFn {
  549. switch r := lx.next(); {
  550. case isWhitespace(r):
  551. return lexSkip(lx, lexInlineTableValueEnd)
  552. case isNL(r):
  553. return lx.errorf("newlines not allowed within inline tables")
  554. case r == commentStart:
  555. lx.push(lexInlineTableValueEnd)
  556. return lexCommentStart
  557. case r == comma:
  558. lx.ignore()
  559. lx.skip(isWhitespace)
  560. if lx.peek() == '}' {
  561. return lx.errorf("trailing comma not allowed in inline tables")
  562. }
  563. return lexInlineTableValue
  564. case r == inlineTableEnd:
  565. return lexInlineTableEnd
  566. default:
  567. return lx.errorf(
  568. "expected a comma or an inline table terminator %q, but got %s instead",
  569. inlineTableEnd, runeOrEOF(r))
  570. }
  571. }
  572. func runeOrEOF(r rune) string {
  573. if r == eof {
  574. return "end of file"
  575. }
  576. return "'" + string(r) + "'"
  577. }
  578. // lexInlineTableEnd finishes the lexing of an inline table.
  579. // It assumes that a '}' has just been consumed.
  580. func lexInlineTableEnd(lx *lexer) stateFn {
  581. lx.ignore()
  582. lx.emit(itemInlineTableEnd)
  583. return lx.pop()
  584. }
  585. // lexString consumes the inner contents of a string. It assumes that the
  586. // beginning '"' has already been consumed and ignored.
  587. func lexString(lx *lexer) stateFn {
  588. r := lx.next()
  589. switch {
  590. case r == eof:
  591. return lx.errorf(`unexpected EOF; expected '"'`)
  592. case isControl(r) || r == '\r':
  593. return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
  594. case isNL(r):
  595. return lx.errorf("strings cannot contain newlines")
  596. case r == '\\':
  597. lx.push(lexString)
  598. return lexStringEscape
  599. case r == stringEnd:
  600. lx.backup()
  601. lx.emit(itemString)
  602. lx.next()
  603. lx.ignore()
  604. return lx.pop()
  605. }
  606. return lexString
  607. }
// lexMultilineString consumes the inner contents of a multi-line basic
// string. It assumes that the beginning '"""' has already been consumed and
// ignored. Each invocation reads one rune; the closing '"""' emits the
// contents (without the delimiter) as itemMultilineString and returns the
// state popped from the stack.
func lexMultilineString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	case eof:
		return lx.errorf(`unexpected EOF; expected '"""'`)
	case '\r':
		// A carriage return is only allowed as part of "\r\n".
		if lx.peek() != '\n' {
			return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
		}
		return lexMultilineString
	case '\\':
		return lexMultilineStringEscape
	case stringEnd:
		// Found " → try to read two more "".
		if lx.accept(stringEnd) {
			if lx.accept(stringEnd) {
				// Peek ahead: the string can contain " and "", including at the
				// end: """str"""""
				// 6 or more at the end, however, is an error.
				if lx.peek() == stringEnd {
					// Check if we already lexed 5 "s; if so we have 6 now, and
					// that's just too many man!
					if strings.HasSuffix(lx.current(), `"""""`) {
						return lx.errorf(`unexpected '""""""'`)
					}
					// Un-read two of the three quotes so the first one stays
					// part of the contents and the run is examined again from
					// the next quote.
					lx.backup()
					lx.backup()
					return lexMultilineString
				}

				lx.backup() // backup: don't include the """ in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemMultilineString)
				lx.next() // Read over """ again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			// Only two quotes in a row: un-read the second one.
			lx.backup()
		}
	}

	if isControl(r) {
		return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
	}
	return lexMultilineString
}
  657. // lexRawString consumes a raw string. Nothing can be escaped in such a string.
  658. // It assumes that the beginning "'" has already been consumed and ignored.
  659. func lexRawString(lx *lexer) stateFn {
  660. r := lx.next()
  661. switch {
  662. case r == eof:
  663. return lx.errorf(`unexpected EOF; expected "'"`)
  664. case isControl(r) || r == '\r':
  665. return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
  666. case isNL(r):
  667. return lx.errorf("strings cannot contain newlines")
  668. case r == rawStringEnd:
  669. lx.backup()
  670. lx.emit(itemRawString)
  671. lx.next()
  672. lx.ignore()
  673. return lx.pop()
  674. }
  675. return lexRawString
  676. }
// lexMultilineRawString consumes a multi-line raw string. Nothing can be
// escaped in such a string. It assumes that the beginning "'''" has already
// been consumed and ignored. Each invocation reads one rune; the closing
// "'''" emits the contents (without the delimiter) as itemRawMultilineString
// and returns the state popped from the stack.
func lexMultilineRawString(lx *lexer) stateFn {
	r := lx.next()
	switch r {
	case eof:
		return lx.errorf(`unexpected EOF; expected "'''"`)
	case '\r':
		// A carriage return is only allowed as part of "\r\n".
		if lx.peek() != '\n' {
			return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
		}
		return lexMultilineRawString
	case rawStringEnd:
		// Found ' → try to read two more ''.
		if lx.accept(rawStringEnd) {
			if lx.accept(rawStringEnd) {
				// Peek ahead: the string can contain ' and '', including at the
				// end: '''str'''''
				// 6 or more at the end, however, is an error.
				if lx.peek() == rawStringEnd {
					// Check if we already lexed 5 's; if so we have 6 now, and
					// that's just too many man!
					if strings.HasSuffix(lx.current(), "'''''") {
						return lx.errorf(`unexpected "''''''"`)
					}
					// Un-read two of the three quotes so the first one stays
					// part of the contents and the run is examined again from
					// the next quote.
					lx.backup()
					lx.backup()
					return lexMultilineRawString
				}

				lx.backup() // backup: don't include the ''' in the item.
				lx.backup()
				lx.backup()
				lx.emit(itemRawMultilineString)
				lx.next() // Read over ''' again and discard it.
				lx.next()
				lx.next()
				lx.ignore()
				return lx.pop()
			}
			// Only two quotes in a row: un-read the second one.
			lx.backup()
		}
	}

	if isControl(r) {
		return lx.errorf("control characters are not allowed inside strings: '0x%02x'", r)
	}
	return lexMultilineRawString
}
  725. // lexMultilineStringEscape consumes an escaped character. It assumes that the
  726. // preceding '\\' has already been consumed.
  727. func lexMultilineStringEscape(lx *lexer) stateFn {
  728. // Handle the special case first:
  729. if isNL(lx.next()) {
  730. return lexMultilineString
  731. }
  732. lx.backup()
  733. lx.push(lexMultilineString)
  734. return lexStringEscape(lx)
  735. }
  736. func lexStringEscape(lx *lexer) stateFn {
  737. r := lx.next()
  738. switch r {
  739. case 'b':
  740. fallthrough
  741. case 't':
  742. fallthrough
  743. case 'n':
  744. fallthrough
  745. case 'f':
  746. fallthrough
  747. case 'r':
  748. fallthrough
  749. case '"':
  750. fallthrough
  751. case ' ', '\t':
  752. // Inside """ .. """ strings you can use \ to escape newlines, and any
  753. // amount of whitespace can be between the \ and \n.
  754. fallthrough
  755. case '\\':
  756. return lx.pop()
  757. case 'u':
  758. return lexShortUnicodeEscape
  759. case 'U':
  760. return lexLongUnicodeEscape
  761. }
  762. return lx.errorf("invalid escape character %q; only the following escape characters are allowed: "+
  763. `\b, \t, \n, \f, \r, \", \\, \uXXXX, and \UXXXXXXXX`, r)
  764. }
  765. func lexShortUnicodeEscape(lx *lexer) stateFn {
  766. var r rune
  767. for i := 0; i < 4; i++ {
  768. r = lx.next()
  769. if !isHexadecimal(r) {
  770. return lx.errorf(
  771. `expected four hexadecimal digits after '\u', but got %q instead`,
  772. lx.current())
  773. }
  774. }
  775. return lx.pop()
  776. }
  777. func lexLongUnicodeEscape(lx *lexer) stateFn {
  778. var r rune
  779. for i := 0; i < 8; i++ {
  780. r = lx.next()
  781. if !isHexadecimal(r) {
  782. return lx.errorf(
  783. `expected eight hexadecimal digits after '\U', but got %q instead`,
  784. lx.current())
  785. }
  786. }
  787. return lx.pop()
  788. }
  789. // lexNumberOrDateStart processes the first character of a value which begins
  790. // with a digit. It exists to catch values starting with '0', so that
  791. // lexBaseNumberOrDate can differentiate base prefixed integers from other
  792. // types.
  793. func lexNumberOrDateStart(lx *lexer) stateFn {
  794. r := lx.next()
  795. switch r {
  796. case '0':
  797. return lexBaseNumberOrDate
  798. }
  799. if !isDigit(r) {
  800. // The only way to reach this state is if the value starts
  801. // with a digit, so specifically treat anything else as an
  802. // error.
  803. return lx.errorf("expected a digit but got %q", r)
  804. }
  805. return lexNumberOrDate
  806. }
  807. // lexNumberOrDate consumes either an integer, float or datetime.
  808. func lexNumberOrDate(lx *lexer) stateFn {
  809. r := lx.next()
  810. if isDigit(r) {
  811. return lexNumberOrDate
  812. }
  813. switch r {
  814. case '-', ':':
  815. return lexDatetime
  816. case '_':
  817. return lexDecimalNumber
  818. case '.', 'e', 'E':
  819. return lexFloat
  820. }
  821. lx.backup()
  822. lx.emit(itemInteger)
  823. return lx.pop()
  824. }
  825. // lexDatetime consumes a Datetime, to a first approximation.
  826. // The parser validates that it matches one of the accepted formats.
  827. func lexDatetime(lx *lexer) stateFn {
  828. r := lx.next()
  829. if isDigit(r) {
  830. return lexDatetime
  831. }
  832. switch r {
  833. case '-', ':', 'T', 't', ' ', '.', 'Z', 'z', '+':
  834. return lexDatetime
  835. }
  836. lx.backup()
  837. lx.emitTrim(itemDatetime)
  838. return lx.pop()
  839. }
  840. // lexHexInteger consumes a hexadecimal integer after seeing the '0x' prefix.
  841. func lexHexInteger(lx *lexer) stateFn {
  842. r := lx.next()
  843. if isHexadecimal(r) {
  844. return lexHexInteger
  845. }
  846. switch r {
  847. case '_':
  848. return lexHexInteger
  849. }
  850. lx.backup()
  851. lx.emit(itemInteger)
  852. return lx.pop()
  853. }
  854. // lexOctalInteger consumes an octal integer after seeing the '0o' prefix.
  855. func lexOctalInteger(lx *lexer) stateFn {
  856. r := lx.next()
  857. if isOctal(r) {
  858. return lexOctalInteger
  859. }
  860. switch r {
  861. case '_':
  862. return lexOctalInteger
  863. }
  864. lx.backup()
  865. lx.emit(itemInteger)
  866. return lx.pop()
  867. }
  868. // lexBinaryInteger consumes a binary integer after seeing the '0b' prefix.
  869. func lexBinaryInteger(lx *lexer) stateFn {
  870. r := lx.next()
  871. if isBinary(r) {
  872. return lexBinaryInteger
  873. }
  874. switch r {
  875. case '_':
  876. return lexBinaryInteger
  877. }
  878. lx.backup()
  879. lx.emit(itemInteger)
  880. return lx.pop()
  881. }
  882. // lexDecimalNumber consumes a decimal float or integer.
  883. func lexDecimalNumber(lx *lexer) stateFn {
  884. r := lx.next()
  885. if isDigit(r) {
  886. return lexDecimalNumber
  887. }
  888. switch r {
  889. case '.', 'e', 'E':
  890. return lexFloat
  891. case '_':
  892. return lexDecimalNumber
  893. }
  894. lx.backup()
  895. lx.emit(itemInteger)
  896. return lx.pop()
  897. }
  898. // lexDecimalNumber consumes the first digit of a number beginning with a sign.
  899. // It assumes the sign has already been consumed. Values which start with a sign
  900. // are only allowed to be decimal integers or floats.
  901. //
  902. // The special "nan" and "inf" values are also recognized.
  903. func lexDecimalNumberStart(lx *lexer) stateFn {
  904. r := lx.next()
  905. // Special error cases to give users better error messages
  906. switch r {
  907. case 'i':
  908. if !lx.accept('n') || !lx.accept('f') {
  909. return lx.errorf("invalid float: '%s'", lx.current())
  910. }
  911. lx.emit(itemFloat)
  912. return lx.pop()
  913. case 'n':
  914. if !lx.accept('a') || !lx.accept('n') {
  915. return lx.errorf("invalid float: '%s'", lx.current())
  916. }
  917. lx.emit(itemFloat)
  918. return lx.pop()
  919. case '0':
  920. p := lx.peek()
  921. switch p {
  922. case 'b', 'o', 'x':
  923. return lx.errorf("cannot use sign with non-decimal numbers: '%s%c'", lx.current(), p)
  924. }
  925. case '.':
  926. return lx.errorf("floats must start with a digit, not '.'")
  927. }
  928. if isDigit(r) {
  929. return lexDecimalNumber
  930. }
  931. return lx.errorf("expected a digit but got %q", r)
  932. }
  933. // lexBaseNumberOrDate differentiates between the possible values which
  934. // start with '0'. It assumes that before reaching this state, the initial '0'
  935. // has been consumed.
  936. func lexBaseNumberOrDate(lx *lexer) stateFn {
  937. r := lx.next()
  938. // Note: All datetimes start with at least two digits, so we don't
  939. // handle date characters (':', '-', etc.) here.
  940. if isDigit(r) {
  941. return lexNumberOrDate
  942. }
  943. switch r {
  944. case '_':
  945. // Can only be decimal, because there can't be an underscore
  946. // between the '0' and the base designator, and dates can't
  947. // contain underscores.
  948. return lexDecimalNumber
  949. case '.', 'e', 'E':
  950. return lexFloat
  951. case 'b':
  952. r = lx.peek()
  953. if !isBinary(r) {
  954. lx.errorf("not a binary number: '%s%c'", lx.current(), r)
  955. }
  956. return lexBinaryInteger
  957. case 'o':
  958. r = lx.peek()
  959. if !isOctal(r) {
  960. lx.errorf("not an octal number: '%s%c'", lx.current(), r)
  961. }
  962. return lexOctalInteger
  963. case 'x':
  964. r = lx.peek()
  965. if !isHexadecimal(r) {
  966. lx.errorf("not a hexidecimal number: '%s%c'", lx.current(), r)
  967. }
  968. return lexHexInteger
  969. }
  970. lx.backup()
  971. lx.emit(itemInteger)
  972. return lx.pop()
  973. }
  974. // lexFloat consumes the elements of a float. It allows any sequence of
  975. // float-like characters, so floats emitted by the lexer are only a first
  976. // approximation and must be validated by the parser.
  977. func lexFloat(lx *lexer) stateFn {
  978. r := lx.next()
  979. if isDigit(r) {
  980. return lexFloat
  981. }
  982. switch r {
  983. case '_', '.', '-', '+', 'e', 'E':
  984. return lexFloat
  985. }
  986. lx.backup()
  987. lx.emit(itemFloat)
  988. return lx.pop()
  989. }
  990. // lexBool consumes a bool string: 'true' or 'false.
  991. func lexBool(lx *lexer) stateFn {
  992. var rs []rune
  993. for {
  994. r := lx.next()
  995. if !unicode.IsLetter(r) {
  996. lx.backup()
  997. break
  998. }
  999. rs = append(rs, r)
  1000. }
  1001. s := string(rs)
  1002. switch s {
  1003. case "true", "false":
  1004. lx.emit(itemBool)
  1005. return lx.pop()
  1006. }
  1007. return lx.errorf("expected value but found %q instead", s)
  1008. }
  1009. // lexCommentStart begins the lexing of a comment. It will emit
  1010. // itemCommentStart and consume no characters, passing control to lexComment.
  1011. func lexCommentStart(lx *lexer) stateFn {
  1012. lx.ignore()
  1013. lx.emit(itemCommentStart)
  1014. return lexComment
  1015. }
  1016. // lexComment lexes an entire comment. It assumes that '#' has been consumed.
  1017. // It will consume *up to* the first newline character, and pass control
  1018. // back to the last state on the stack.
  1019. func lexComment(lx *lexer) stateFn {
  1020. switch r := lx.next(); {
  1021. case isNL(r) || r == eof:
  1022. lx.backup()
  1023. lx.emit(itemText)
  1024. return lx.pop()
  1025. case isControl(r):
  1026. return lx.errorf("control characters are not allowed inside comments: '0x%02x'", r)
  1027. default:
  1028. return lexComment
  1029. }
  1030. }
  1031. // lexSkip ignores all slurped input and moves on to the next state.
  1032. func lexSkip(lx *lexer, nextState stateFn) stateFn {
  1033. lx.ignore()
  1034. return nextState
  1035. }
  1036. // isWhitespace returns true if `r` is a whitespace character according
  1037. // to the spec.
  1038. func isWhitespace(r rune) bool {
  1039. return r == '\t' || r == ' '
  1040. }
  1041. func isNL(r rune) bool {
  1042. return r == '\n' || r == '\r'
  1043. }
  1044. // Control characters except \n, \t
  1045. func isControl(r rune) bool {
  1046. switch r {
  1047. case '\t', '\r', '\n':
  1048. return false
  1049. default:
  1050. return (r >= 0x00 && r <= 0x1f) || r == 0x7f
  1051. }
  1052. }
  1053. func isDigit(r rune) bool {
  1054. return r >= '0' && r <= '9'
  1055. }
  1056. func isHexadecimal(r rune) bool {
  1057. return (r >= '0' && r <= '9') ||
  1058. (r >= 'a' && r <= 'f') ||
  1059. (r >= 'A' && r <= 'F')
  1060. }
  1061. func isOctal(r rune) bool {
  1062. return r >= '0' && r <= '7'
  1063. }
  1064. func isBinary(r rune) bool {
  1065. return r == '0' || r == '1'
  1066. }
  1067. func isBareKeyChar(r rune) bool {
  1068. return (r >= 'A' && r <= 'Z') ||
  1069. (r >= 'a' && r <= 'z') ||
  1070. (r >= '0' && r <= '9') ||
  1071. r == '_' ||
  1072. r == '-'
  1073. }
  1074. func (s stateFn) String() string {
  1075. name := runtime.FuncForPC(reflect.ValueOf(s).Pointer()).Name()
  1076. if i := strings.LastIndexByte(name, '.'); i > -1 {
  1077. name = name[i+1:]
  1078. }
  1079. if s == nil {
  1080. name = "<nil>"
  1081. }
  1082. return name + "()"
  1083. }
  1084. func (itype itemType) String() string {
  1085. switch itype {
  1086. case itemError:
  1087. return "Error"
  1088. case itemNIL:
  1089. return "NIL"
  1090. case itemEOF:
  1091. return "EOF"
  1092. case itemText:
  1093. return "Text"
  1094. case itemString, itemRawString, itemMultilineString, itemRawMultilineString:
  1095. return "String"
  1096. case itemBool:
  1097. return "Bool"
  1098. case itemInteger:
  1099. return "Integer"
  1100. case itemFloat:
  1101. return "Float"
  1102. case itemDatetime:
  1103. return "DateTime"
  1104. case itemTableStart:
  1105. return "TableStart"
  1106. case itemTableEnd:
  1107. return "TableEnd"
  1108. case itemKeyStart:
  1109. return "KeyStart"
  1110. case itemKeyEnd:
  1111. return "KeyEnd"
  1112. case itemArray:
  1113. return "Array"
  1114. case itemArrayEnd:
  1115. return "ArrayEnd"
  1116. case itemCommentStart:
  1117. return "CommentStart"
  1118. case itemInlineTableStart:
  1119. return "InlineTableStart"
  1120. case itemInlineTableEnd:
  1121. return "InlineTableEnd"
  1122. }
  1123. panic(fmt.Sprintf("BUG: Unknown type '%d'.", int(itype)))
  1124. }
  1125. func (item item) String() string {
  1126. return fmt.Sprintf("(%s, %s)", item.typ.String(), item.val)
  1127. }