#!/usr/bin/env python
# __author__ = "Ronie Martinez"
# __copyright__ = "Copyright 2018-2019, Ronie Martinez"
# __credits__ = ["Ronie Martinez"]
# __maintainer__ = "Ronie Martinez"
# __email__ = "ronmarti18@gmail.com"
  7. def tokenize(data):
  8. iterable = iter(data)
  9. buffer = ''
  10. while True:
  11. try:
  12. char = next(iterable)
  13. if char == '\\':
  14. if buffer == '\\':
  15. yield buffer + char
  16. buffer = ''
  17. continue
  18. elif len(buffer):
  19. yield buffer
  20. buffer = char
  21. try:
  22. buffer += next(iterable)
  23. except StopIteration:
  24. break
  25. elif char.isalpha():
  26. if len(buffer):
  27. if buffer.endswith('}'):
  28. yield buffer
  29. yield char
  30. buffer = ''
  31. elif buffer.startswith('\\'):
  32. buffer += char
  33. else:
  34. yield char
  35. elif char.isdigit():
  36. if len(buffer):
  37. yield buffer
  38. buffer = char
  39. while True:
  40. try:
  41. char = next(iterable)
  42. except StopIteration:
  43. break
  44. if char.isspace():
  45. yield buffer
  46. buffer = ''
  47. break
  48. elif char.isdigit() or char == '.':
  49. buffer += char
  50. else:
  51. if buffer.endswith('.'):
  52. yield buffer[:-1]
  53. yield buffer[-1]
  54. else:
  55. yield buffer
  56. buffer = ''
  57. if char == '\\':
  58. buffer = char
  59. else:
  60. yield char
  61. break
  62. elif char.isspace():
  63. if len(buffer):
  64. yield buffer
  65. buffer = ''
  66. elif char in '{}*':
  67. if buffer.startswith(r'\begin') or buffer.startswith(r'\end'):
  68. if buffer.endswith('}'):
  69. yield buffer
  70. yield char
  71. buffer = ''
  72. else:
  73. buffer += char
  74. else:
  75. if len(buffer):
  76. yield buffer
  77. buffer = ''
  78. yield char
  79. else:
  80. if len(buffer):
  81. yield buffer
  82. buffer = ''
  83. if len(char):
  84. yield char
  85. except StopIteration:
  86. break
  87. if len(buffer):
  88. yield buffer