text_file.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. """text_file
  2. provides the TextFile class, which gives an interface to text files
  3. that (optionally) takes care of stripping comments, ignoring blank
  4. lines, and joining lines with backslashes."""
  5. import sys
  6. class TextFile:
  7. """Provides a file-like object that takes care of all the things you
  8. commonly want to do when processing a text file that has some
  9. line-by-line syntax: strip comments (as long as "#" is your
  10. comment character), skip blank lines, join adjacent lines by
  11. escaping the newline (ie. backslash at end of line), strip
  12. leading and/or trailing whitespace. All of these are optional
  13. and independently controllable.
  14. Provides a 'warn()' method so you can generate warning messages that
  15. report physical line number, even if the logical line in question
  16. spans multiple physical lines. Also provides 'unreadline()' for
  17. implementing line-at-a-time lookahead.
  18. Constructor is called as:
  19. TextFile (filename=None, file=None, **options)
  20. It bombs (RuntimeError) if both 'filename' and 'file' are None;
  21. 'filename' should be a string, and 'file' a file object (or
  22. something that provides 'readline()' and 'close()' methods). It is
  23. recommended that you supply at least 'filename', so that TextFile
  24. can include it in warning messages. If 'file' is not supplied,
  25. TextFile creates its own using 'io.open()'.
  26. The options are all boolean, and affect the value returned by
  27. 'readline()':
  28. strip_comments [default: true]
  29. strip from "#" to end-of-line, as well as any whitespace
  30. leading up to the "#" -- unless it is escaped by a backslash
  31. lstrip_ws [default: false]
  32. strip leading whitespace from each line before returning it
  33. rstrip_ws [default: true]
  34. strip trailing whitespace (including line terminator!) from
  35. each line before returning it
  36. skip_blanks [default: true}
  37. skip lines that are empty *after* stripping comments and
  38. whitespace. (If both lstrip_ws and rstrip_ws are false,
  39. then some lines may consist of solely whitespace: these will
  40. *not* be skipped, even if 'skip_blanks' is true.)
  41. join_lines [default: false]
  42. if a backslash is the last non-newline character on a line
  43. after stripping comments and whitespace, join the following line
  44. to it to form one "logical line"; if N consecutive lines end
  45. with a backslash, then N+1 physical lines will be joined to
  46. form one logical line.
  47. collapse_join [default: false]
  48. strip leading whitespace from lines that are joined to their
  49. predecessor; only matters if (join_lines and not lstrip_ws)
  50. errors [default: 'strict']
  51. error handler used to decode the file content
  52. Note that since 'rstrip_ws' can strip the trailing newline, the
  53. semantics of 'readline()' must differ from those of the builtin file
  54. object's 'readline()' method! In particular, 'readline()' returns
  55. None for end-of-file: an empty string might just be a blank line (or
  56. an all-whitespace line), if 'rstrip_ws' is true but 'skip_blanks' is
  57. not."""
  58. default_options = {
  59. 'strip_comments': 1,
  60. 'skip_blanks': 1,
  61. 'lstrip_ws': 0,
  62. 'rstrip_ws': 1,
  63. 'join_lines': 0,
  64. 'collapse_join': 0,
  65. 'errors': 'strict',
  66. }
  67. def __init__(self, filename=None, file=None, **options):
  68. """Construct a new TextFile object. At least one of 'filename'
  69. (a string) and 'file' (a file-like object) must be supplied.
  70. They keyword argument options are described above and affect
  71. the values returned by 'readline()'."""
  72. if filename is None and file is None:
  73. raise RuntimeError(
  74. "you must supply either or both of 'filename' and 'file'"
  75. )
  76. # set values for all options -- either from client option hash
  77. # or fallback to default_options
  78. for opt in self.default_options.keys():
  79. if opt in options:
  80. setattr(self, opt, options[opt])
  81. else:
  82. setattr(self, opt, self.default_options[opt])
  83. # sanity check client option hash
  84. for opt in options.keys():
  85. if opt not in self.default_options:
  86. raise KeyError("invalid TextFile option '%s'" % opt)
  87. if file is None:
  88. self.open(filename)
  89. else:
  90. self.filename = filename
  91. self.file = file
  92. self.current_line = 0 # assuming that file is at BOF!
  93. # 'linebuf' is a stack of lines that will be emptied before we
  94. # actually read from the file; it's only populated by an
  95. # 'unreadline()' operation
  96. self.linebuf = []
  97. def open(self, filename):
  98. """Open a new file named 'filename'. This overrides both the
  99. 'filename' and 'file' arguments to the constructor."""
  100. self.filename = filename
  101. self.file = open(self.filename, errors=self.errors)
  102. self.current_line = 0
  103. def close(self):
  104. """Close the current file and forget everything we know about it
  105. (filename, current line number)."""
  106. file = self.file
  107. self.file = None
  108. self.filename = None
  109. self.current_line = None
  110. file.close()
  111. def gen_error(self, msg, line=None):
  112. outmsg = []
  113. if line is None:
  114. line = self.current_line
  115. outmsg.append(self.filename + ", ")
  116. if isinstance(line, (list, tuple)):
  117. outmsg.append("lines %d-%d: " % tuple(line))
  118. else:
  119. outmsg.append("line %d: " % line)
  120. outmsg.append(str(msg))
  121. return "".join(outmsg)
  122. def error(self, msg, line=None):
  123. raise ValueError("error: " + self.gen_error(msg, line))
  124. def warn(self, msg, line=None):
  125. """Print (to stderr) a warning message tied to the current logical
  126. line in the current file. If the current logical line in the
  127. file spans multiple physical lines, the warning refers to the
  128. whole range, eg. "lines 3-5". If 'line' supplied, it overrides
  129. the current line number; it may be a list or tuple to indicate a
  130. range of physical lines, or an integer for a single physical
  131. line."""
  132. sys.stderr.write("warning: " + self.gen_error(msg, line) + "\n")
  133. def readline(self): # noqa: C901
  134. """Read and return a single logical line from the current file (or
  135. from an internal buffer if lines have previously been "unread"
  136. with 'unreadline()'). If the 'join_lines' option is true, this
  137. may involve reading multiple physical lines concatenated into a
  138. single string. Updates the current line number, so calling
  139. 'warn()' after 'readline()' emits a warning about the physical
  140. line(s) just read. Returns None on end-of-file, since the empty
  141. string can occur if 'rstrip_ws' is true but 'strip_blanks' is
  142. not."""
  143. # If any "unread" lines waiting in 'linebuf', return the top
  144. # one. (We don't actually buffer read-ahead data -- lines only
  145. # get put in 'linebuf' if the client explicitly does an
  146. # 'unreadline()'.
  147. if self.linebuf:
  148. line = self.linebuf[-1]
  149. del self.linebuf[-1]
  150. return line
  151. buildup_line = ''
  152. while True:
  153. # read the line, make it None if EOF
  154. line = self.file.readline()
  155. if line == '':
  156. line = None
  157. if self.strip_comments and line:
  158. # Look for the first "#" in the line. If none, never
  159. # mind. If we find one and it's the first character, or
  160. # is not preceded by "\", then it starts a comment --
  161. # strip the comment, strip whitespace before it, and
  162. # carry on. Otherwise, it's just an escaped "#", so
  163. # unescape it (and any other escaped "#"'s that might be
  164. # lurking in there) and otherwise leave the line alone.
  165. pos = line.find("#")
  166. if pos == -1: # no "#" -- no comments
  167. pass
  168. # It's definitely a comment -- either "#" is the first
  169. # character, or it's elsewhere and unescaped.
  170. elif pos == 0 or line[pos - 1] != "\\":
  171. # Have to preserve the trailing newline, because it's
  172. # the job of a later step (rstrip_ws) to remove it --
  173. # and if rstrip_ws is false, we'd better preserve it!
  174. # (NB. this means that if the final line is all comment
  175. # and has no trailing newline, we will think that it's
  176. # EOF; I think that's OK.)
  177. eol = (line[-1] == '\n') and '\n' or ''
  178. line = line[0:pos] + eol
  179. # If all that's left is whitespace, then skip line
  180. # *now*, before we try to join it to 'buildup_line' --
  181. # that way constructs like
  182. # hello \\
  183. # # comment that should be ignored
  184. # there
  185. # result in "hello there".
  186. if line.strip() == "":
  187. continue
  188. else: # it's an escaped "#"
  189. line = line.replace("\\#", "#")
  190. # did previous line end with a backslash? then accumulate
  191. if self.join_lines and buildup_line:
  192. # oops: end of file
  193. if line is None:
  194. self.warn("continuation line immediately precedes " "end-of-file")
  195. return buildup_line
  196. if self.collapse_join:
  197. line = line.lstrip()
  198. line = buildup_line + line
  199. # careful: pay attention to line number when incrementing it
  200. if isinstance(self.current_line, list):
  201. self.current_line[1] = self.current_line[1] + 1
  202. else:
  203. self.current_line = [self.current_line, self.current_line + 1]
  204. # just an ordinary line, read it as usual
  205. else:
  206. if line is None: # eof
  207. return None
  208. # still have to be careful about incrementing the line number!
  209. if isinstance(self.current_line, list):
  210. self.current_line = self.current_line[1] + 1
  211. else:
  212. self.current_line = self.current_line + 1
  213. # strip whitespace however the client wants (leading and
  214. # trailing, or one or the other, or neither)
  215. if self.lstrip_ws and self.rstrip_ws:
  216. line = line.strip()
  217. elif self.lstrip_ws:
  218. line = line.lstrip()
  219. elif self.rstrip_ws:
  220. line = line.rstrip()
  221. # blank line (whether we rstrip'ed or not)? skip to next line
  222. # if appropriate
  223. if (line == '' or line == '\n') and self.skip_blanks:
  224. continue
  225. if self.join_lines:
  226. if line[-1] == '\\':
  227. buildup_line = line[:-1]
  228. continue
  229. if line[-2:] == '\\\n':
  230. buildup_line = line[0:-2] + '\n'
  231. continue
  232. # well, I guess there's some actual content there: return it
  233. return line
  234. def readlines(self):
  235. """Read and return the list of all logical lines remaining in the
  236. current file."""
  237. lines = []
  238. while True:
  239. line = self.readline()
  240. if line is None:
  241. return lines
  242. lines.append(line)
  243. def unreadline(self, line):
  244. """Push 'line' (a string) onto an internal buffer that will be
  245. checked by future 'readline()' calls. Handy for implementing
  246. a parser with line-at-a-time lookahead."""
  247. self.linebuf.append(line)