Skip to main content

csv_legacy/
reader.rs

1use alloc::{
2    borrow::Cow,
3    string::{String, ToString},
4    vec::Vec,
5};
6use core::{ops::Index, str};
7
8use crate::error::{ReadError, ReadErrorKind};
9
/// Half-open byte span `start..end` covering one field, together with a
/// flag recording whether the field was written in quoted form.
///
/// The parser records spans as offsets into its read buffer; a `Row`
/// stores them rebased onto its own `input` string.
#[derive(Clone, Copy, Debug)]
struct FieldRange {
    /// Offset of the first byte of the field (the opening `"` for a
    /// quoted field).
    start: usize,
    /// Offset one past the last byte of the field (past the closing `"`
    /// for a quoted field).
    end: usize,
    /// `true` when the field was enclosed in double quotes.
    quoted: bool,
}
16
/// Where the row parser currently stands relative to the field being scanned.
#[derive(Clone, Copy, Debug, PartialEq)]
enum State {
    /// No byte of the next field has been consumed yet.
    StartOfField,
    /// Inside a field that did not open with `"`.
    InUnquoted,
    /// Inside a quoted field, somewhere after its opening `"`.
    InQuoted,
    /// A `"` was just consumed inside a quoted field: it is either the
    /// closing quote or the first half of an escaped `""` pair.
    AfterQuote,
}
24
25/// Parses CSV data from a byte slice or a `std::io::Read` source.
26///
27/// Create an iterator over rows with [`Reader::rows`]:
28///
29/// ```no_run
30/// # use csv_legacy::Reader;
31/// let data = b"name,age\nAlice,30\nBob,25\n";
32/// for row in Reader::new(data).rows() {
33///     let row = row?;
34///     // ...
35/// }
36/// # Ok::<_, csv_legacy::ReadError>(())
37/// ```
38pub struct Reader {
39    buf: Vec<u8>,
40    pos: usize,
41    field_ranges: Vec<FieldRange>,
42    field_start: usize,
43    field_start_column: usize,
44    state: State,
45    line: usize,
46    column: usize,
47    delimiter: u8,
48    flexible: bool,
49    eof: bool,
50
51    #[cfg(feature = "std")]
52    source: Option<Box<dyn std::io::Read>>,
53}
54
55impl Reader {
56    /// Create a reader from a byte slice. The data is copied into
57    /// an internal buffer. This constructor is available with or
58    /// without the `std` feature.
59    pub fn new(data: &[u8]) -> Self {
60        Reader {
61            buf: data.to_vec(),
62            pos: 0,
63            field_ranges: Vec::new(),
64            field_start: 0,
65            field_start_column: 1,
66            state: State::StartOfField,
67            line: 1,
68            column: 1,
69            delimiter: b',',
70            flexible: false,
71            eof: true,
72            #[cfg(feature = "std")]
73            source: None,
74        }
75    }
76
77    /// Set the field delimiter byte (default is `,`).
78    pub fn set_delimiter(&mut self, byte: u8) -> &mut Self {
79        self.delimiter = byte;
80        self
81    }
82
83    /// Allow variable numbers of fields per row (default `false`).
84    pub fn set_flexible(&mut self, yes: bool) -> &mut Self {
85        self.flexible = yes;
86        self
87    }
88
89    /// Read all rows from this CSV source.
90    ///
91    /// Consumes the reader and returns an iterator yielding
92    /// [`Result<Row, ReadError>`].
93    ///
94    /// # Example
95    ///
96    /// ```no_run
97    /// # use csv_legacy::Reader;
98    /// let data = b"a,b,c\n1,2,3\n";
99    /// let mut total = 0usize;
100    /// for row in Reader::new(data).rows() {
101    ///     total += row?.len();
102    /// }
103    /// assert_eq!(total, 6);
104    /// # Ok::<_, csv_legacy::ReadError>(())
105    /// ```
106    pub fn rows(self) -> Rows {
107        Rows {
108            reader: self,
109        }
110    }
111
112    fn read_row(&mut self) -> Result<Option<Row>, ReadError> {
113        self.field_ranges.clear();
114        self.state = State::StartOfField;
115
116        loop {
117            if self.pos >= self.buf.len() && !self.fill_buf()? {
118                return if self.field_ranges.is_empty() && self.state == State::StartOfField {
119                    Ok(None)
120                } else {
121                    if self.state == State::InQuoted {
122                        return Err(ReadError::new(
123                            ReadErrorKind::UnterminatedQuote,
124                            self.line,
125                            self.column_at(self.field_start),
126                        ));
127                    }
128                    match self.state {
129                        State::InUnquoted => {
130                            self.field_ranges.push(FieldRange {
131                                start: self.field_start,
132                                end: self.pos,
133                                quoted: false,
134                            });
135                        }
136                        State::AfterQuote => {
137                            self.field_ranges.push(FieldRange {
138                                start: self.field_start,
139                                end: self.pos,
140                                quoted: true,
141                            });
142                        }
143                        State::StartOfField => {
144                            if !self.field_ranges.is_empty() {
145                                self.field_ranges.push(FieldRange {
146                                    start: self.pos,
147                                    end: self.pos,
148                                    quoted: false,
149                                });
150                            }
151                        }
152                        State::InQuoted => unreachable!(),
153                    }
154                    Ok(Some(self.make_row()?))
155                };
156            }
157
158            if self.pos >= self.buf.len() {
159                break;
160            }
161
162            let byte = self.buf[self.pos];
163
164            match self.state {
165                State::StartOfField => {
166                    if byte == b'\r' || byte == b'\n' {
167                        if !self.field_ranges.is_empty() {
168                            self.field_ranges.push(FieldRange {
169                                start: self.pos,
170                                end: self.pos,
171                                quoted: false,
172                            });
173                        }
174                        self.consume_line_end();
175                        if self.field_ranges.is_empty() {
176                            continue;
177                        }
178                        return Ok(Some(self.make_row()?));
179                    }
180                    if byte == self.delimiter {
181                        self.field_ranges.push(FieldRange {
182                            start: self.pos,
183                            end: self.pos,
184                            quoted: false,
185                        });
186                        self.pos += 1;
187                        self.column += 1;
188                        continue;
189                    }
190                    if byte == b'"' {
191                        self.field_start = self.pos;
192                        self.field_start_column = self.column;
193                        self.state = State::InQuoted;
194                        self.pos += 1;
195                        self.column += 1;
196                    } else {
197                        self.field_start = self.pos;
198                        self.field_start_column = self.column;
199                        self.state = State::InUnquoted;
200                        self.pos += 1;
201                        self.column += 1;
202                    }
203                }
204
205                State::InUnquoted => {
206                    if byte == self.delimiter {
207                        self.field_ranges.push(FieldRange {
208                            start: self.field_start,
209                            end: self.pos,
210                            quoted: false,
211                        });
212                        self.state = State::StartOfField;
213                        self.pos += 1;
214                        self.column = 1;
215                    } else if byte == b'\r' || byte == b'\n' {
216                        self.field_ranges.push(FieldRange {
217                            start: self.field_start,
218                            end: self.pos,
219                            quoted: false,
220                        });
221                        self.consume_line_end();
222                        return Ok(Some(self.make_row()?));
223                    } else {
224                        self.pos += 1;
225                        self.column += 1;
226                    }
227                }
228
229                State::InQuoted => {
230                    if byte == b'"' {
231                        self.state = State::AfterQuote;
232                        self.pos += 1;
233                        self.column += 1;
234                    } else {
235                        self.pos += 1;
236                        self.column += 1;
237                    }
238                }
239
240                State::AfterQuote => {
241                    if byte == b'"' {
242                        self.state = State::InQuoted;
243                        self.pos += 1;
244                        self.column += 1;
245                    } else if byte == self.delimiter {
246                        self.field_ranges.push(FieldRange {
247                            start: self.field_start,
248                            end: self.pos,
249                            quoted: true,
250                        });
251                        self.state = State::StartOfField;
252                        self.pos += 1;
253                        self.column = 1;
254                    } else if byte == b'\r' || byte == b'\n' {
255                        self.field_ranges.push(FieldRange {
256                            start: self.field_start,
257                            end: self.pos,
258                            quoted: true,
259                        });
260                        self.consume_line_end();
261                        return Ok(Some(self.make_row()?));
262                    } else {
263                        return Err(ReadError::new(
264                            ReadErrorKind::TrailingContent,
265                            self.line,
266                            self.column_at(self.field_start),
267                        ));
268                    }
269                }
270            }
271        }
272
273        Ok(None)
274    }
275
276    fn make_row(&mut self) -> Result<Row, ReadError> {
277        let ranges = core::mem::take(&mut self.field_ranges);
278        if ranges.is_empty() {
279            return Ok(Row {
280                input: String::new(),
281                fields: Vec::new(),
282            });
283        }
284        let buf_start = ranges[0].start;
285        let buf_end = ranges.last().unwrap().end;
286        let raw = self.buf[buf_start..buf_end].to_vec();
287        let input = String::from_utf8(raw).map_err(|_| ReadError::new(ReadErrorKind::InvalidUtf8, self.line, 0))?;
288        let fields: Vec<FieldRange> = ranges
289            .iter()
290            .map(|r| FieldRange {
291                start: r.start - buf_start,
292                end: r.end - buf_start,
293                quoted: r.quoted,
294            })
295            .collect();
296        if self.pos > 0 {
297            self.buf.drain(..self.pos);
298            self.pos = 0;
299        }
300        Ok(Row {
301            input,
302            fields,
303        })
304    }
305
306    fn consume_line_end(&mut self) {
307        if self.pos < self.buf.len() && self.buf[self.pos] == b'\r' {
308            self.pos += 1;
309        }
310        if self.pos < self.buf.len() && self.buf[self.pos] == b'\n' {
311            self.pos += 1;
312        }
313        self.line += 1;
314        self.column = 1;
315    }
316
317    fn fill_buf(&mut self) -> Result<bool, ReadError> {
318        if self.eof {
319            return Ok(false);
320        }
321        #[cfg(feature = "std")]
322        {
323            if let Some(source) = &mut self.source {
324                let mut tmp = [0u8; 8192];
325                let n = source.read(&mut tmp)?;
326                if n == 0 {
327                    self.eof = true;
328                    return Ok(false);
329                }
330                self.buf.extend_from_slice(&tmp[..n]);
331                return Ok(true);
332            }
333        }
334        Ok(false)
335    }
336
337    fn column_at(&self, _pos: usize) -> usize {
338        self.field_start_column
339    }
340}
341
#[cfg(feature = "std")]
impl Reader {
    /// Build a reader that pulls its input from any `std::io::Read` source.
    ///
    /// Input is read in chunks on demand while rows are parsed, so the
    /// whole input is never loaded up front.
    ///
    /// # Example
    ///
    /// ```no_run
    /// # use std::fs::File;
    /// # use csv_legacy::Reader;
    /// let file = File::open("data.csv").unwrap();
    /// for row in Reader::from_reader(file).rows() {
    ///     let row = row?;
    ///     // ...
    /// }
    /// # Ok::<_, csv_legacy::ReadError>(())
    /// ```
    pub fn from_reader(reader: impl std::io::Read + 'static) -> Self {
        // Start from the defaults of an empty slice-backed reader, then
        // attach the source and mark the input as not yet exhausted.
        let mut streaming = Reader::new(&[]);
        streaming.eof = false;
        streaming.source = Some(Box::new(reader));
        streaming
    }
}
376
377/// A single row of CSV data.
378///
379/// A `Row` owns its data, so it can outlive the reader used to create it.
380/// Fields are validated as UTF-8 at parse time, so all access methods
381/// return `&str`.
382///
383/// Raw fields (including surrounding quotes) are accessed via [`get_raw`].
384/// Unescaped fields (quotes stripped, `""` resolved) are accessed via
385/// [`fields`].
386///
387/// [`get_raw`]: Row::get_raw
388/// [`fields`]: Row::fields
389#[derive(Clone, Debug)]
390pub struct Row {
391    input: String,
392    fields: Vec<FieldRange>,
393}
394
395impl Row {
396    /// Number of fields in this row.
397    pub fn len(&self) -> usize {
398        self.fields.len()
399    }
400
401    /// Returns `true` if the row has no fields.
402    pub fn is_empty(&self) -> bool {
403        self.fields.is_empty()
404    }
405
406    /// Return the raw field at `index`, or `None` if out of bounds.
407    /// Includes surrounding quotes for quoted fields.
408    pub fn get_raw(&self, index: usize) -> Option<&str> {
409        let range = self.fields.get(index)?;
410        Some(&self.input[range.start..range.end])
411    }
412
413    /// Iterate over unescaped fields, yielding `Cow<str>`.
414    ///
415    /// For quoted fields, surrounding quotes are stripped and `""`
416    /// escape sequences are resolved to a single `"`. Returns
417    /// `Cow::Borrowed` when no escaping is needed.
418    pub fn fields(&self) -> Fields<'_> {
419        Fields {
420            input: &self.input,
421            ranges: self.fields.iter(),
422        }
423    }
424}
425
426impl Index<usize> for Row {
427    type Output = str;
428
429    fn index(&self, index: usize) -> &str {
430        self.get_raw(index).expect("Row index out of bounds")
431    }
432}
433
434/// An owning iterator over raw field strings in a [`Row`].
435///
436/// Created by iterating over a `Row` by value.
437pub struct RowIntoIter {
438    input: String,
439    ranges: alloc::vec::IntoIter<FieldRange>,
440}
441
442impl Iterator for RowIntoIter {
443    type Item = String;
444
445    fn next(&mut self) -> Option<String> {
446        let range = self.ranges.next()?;
447        Some(self.input[range.start..range.end].to_string())
448    }
449
450    fn size_hint(&self) -> (usize, Option<usize>) {
451        self.ranges.size_hint()
452    }
453}
454
455impl ExactSizeIterator for RowIntoIter {
456    fn len(&self) -> usize {
457        self.ranges.len()
458    }
459}
460
461impl IntoIterator for Row {
462    type Item = String;
463    type IntoIter = RowIntoIter;
464
465    fn into_iter(self) -> RowIntoIter {
466        RowIntoIter {
467            input: self.input,
468            ranges: self.fields.into_iter(),
469        }
470    }
471}
472
473/// Iterator over unescaped fields in a [`Row`], yielding `Cow<str>`.
474///
475/// Created by [`Row::fields`]. Returns `Cow::Borrowed` when the field
476/// requires no escaping (the common case), and `Cow::Owned` only when
477/// `""` escape sequences in quoted fields need to be resolved.
478pub struct Fields<'a> {
479    input: &'a str,
480    ranges: core::slice::Iter<'a, FieldRange>,
481}
482
483impl<'a> Iterator for Fields<'a> {
484    type Item = Cow<'a, str>;
485
486    fn next(&mut self) -> Option<Cow<'a, str>> {
487        let range = self.ranges.next()?;
488        let raw = &self.input[range.start..range.end];
489
490        if range.quoted {
491            if raw.len() < 2 {
492                return Some(Cow::Borrowed(""));
493            }
494            let content = &raw[1..raw.len() - 1];
495            if content.contains("\"\"") {
496                Some(Cow::Owned(content.replace("\"\"", "\"")))
497            } else {
498                Some(Cow::Borrowed(content))
499            }
500        } else {
501            Some(Cow::Borrowed(raw))
502        }
503    }
504
505    fn size_hint(&self) -> (usize, Option<usize>) {
506        self.ranges.size_hint()
507    }
508}
509
510impl<'a> ExactSizeIterator for Fields<'a> {
511    fn len(&self) -> usize {
512        self.ranges.len()
513    }
514}
515
516/// An iterator over the rows of a CSV reader.
517///
518/// Created by [`Reader::rows`]. Each item is a [`Result<Row, ReadError>`]
519/// so errors from malformed CSV data are surfaced per row.
520///
521/// # Example
522///
523/// ```no_run
524/// # use csv_legacy::Reader;
525/// let data = b"a,b,c\n1,2,3\n";
526/// for result in Reader::new(data).rows() {
527///     match result {
528///         Ok(row) => println!("got {} fields", row.len()),
529///         Err(e) => eprintln!("error at line {}: {e}", e.line),
530///     }
531/// }
532/// ```
533pub struct Rows {
534    reader: Reader,
535}
536
537impl Iterator for Rows {
538    type Item = Result<Row, ReadError>;
539
540    fn next(&mut self) -> Option<Self::Item> {
541        match self.reader.read_row() {
542            Ok(Some(row)) => Some(Ok(row)),
543            Ok(None) => None,
544            Err(e) => Some(Err(e)),
545        }
546    }
547}