/*
    Copyright 2017, Kelvin Ma (“taylorswift”), kelvin13ma@gmail.com

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

// Modified by izackp@gmail.com to include an API where I can read everything in pieces

#if os(Linux)
import Glibc
#elseif os(OSX)
import Darwin
#endif

/// Error type with no payload. Nothing in this file throws it.
final class EmptyStringError: Error, Sendable {
    init() { }
}

/// A tokenizer failure together with the (zero-based) line and column at which
/// it was detected.
final class XmlError: Error, Sendable {
    /// The description passed to `init`, prefixed with `"line:column "`.
    let message: String
    let line:Int
    let column:Int

    init(_ message: String, _ line:Int, _ column:Int) {
        self.message = "\(line):\(column) \(message)"
        self.line = line
        self.column = column
    }

    /// Wraps a freshly built `XmlError` in `Result.failure`.
    ///
    /// NOTE(review): the generic parameter list was missing from the reviewed copy
    /// of this file (a bare `Result` does not compile); `<T>` / `Result<T, Error>`
    /// is the reconstruction — confirm against the original.
    static func failure<T>(_ message: String, _ line:Int, _ column:Int) -> Result<T, Error> {
        return .failure(XmlError(message, line, column))
    }

    var errorDescription: String? {
        return message
    }
}

/// One lexical unit produced by `XMLParser.readToken()`.
public enum XMLToken {
    /// Character data between markup, with references already expanded.
    case data([Unicode.Scalar])
    /// `<name attr="value" ...>`
    case tag_start(String, [String: String])
    /// `<name attr="value" .../>`
    case tag_empty(String, [String: String])
    /// `</name>`
    case tag_end(String)
    /// `<?target content?>`
    case instruction(String, [Unicode.Scalar])
}

extension Unicode.Scalar {
    // NameStartChar  ::=   ":" | [A-Z] | "_" | [a-z] | [#xC0-#xD6] | [#xD8-#xF6] | [#xF8-#x2FF]
    //                    | [#x370-#x37D] | [#x37F-#x1FFF] | [#x200C-#x200D] | [#x2070-#x218F]
    //                    | [#x2C00-#x2FEF] | [#x3001-#xD7FF] | [#xF900-#xFDCF] | [#xFDF0-#xFFFD]
    //                    | [#x10000-#xEFFFF]
    /// Whether this scalar may begin an XML name.
    var is_xml_name_start:Bool {
        switch self {
        case "a" ... "z", "A" ... "Z", ":", "_",
             "\u{C0}" ... "\u{D6}", "\u{D8}" ... "\u{F6}", "\u{F8}" ... "\u{2FF}",
             "\u{370}" ... "\u{37D}", "\u{37F}" ... "\u{1FFF}",
             "\u{200C}" ... "\u{200D}", "\u{2070}" ... "\u{218F}",
             "\u{2C00}" ... "\u{2FEF}", "\u{3001}" ... "\u{D7FF}",
             "\u{F900}" ... "\u{FDCF}", "\u{FDF0}" ... "\u{FFFD}",
             "\u{10000}" ... "\u{EFFFF}":
            return true
        default:
            return false
        }
    }

    // NameChar       ::=   NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]
    /// Whether this scalar may appear inside an XML name.
    var is_xml_name:Bool {
        if is_xml_name_start {
            return true
        }
        switch self {
        case "0" ... "9", "-", ".", "\u{B7}",
             "\u{0300}" ... "\u{036F}", "\u{203F}" ... "\u{2040}":
            return true
        default:
            return false
        }
    }

    // S ::= (#x20 | #x9 | #xD | #xA)+
    /// Whether this scalar is XML whitespace.
    var is_xml_whitespace:Bool {
        switch self {
        case " ", "\u{9}", "\u{D}", "\u{A}":
            return true
        default:
            return false
        }
    }
}

extension String {
    /// Builds a string from a collection of Unicode scalars.
    ///
    /// NOTE(review): the generic parameter `<C>` was missing from the reviewed copy
    /// of this file and has been restored.
    init<C>(_ buffer:C) where C:Collection, C.Element == Unicode.Scalar {
        self.init(buffer.map(Character.init))
    }
}

/// States of the tokenizer's finite-state machine. States carrying a scalar hold
/// the previously read scalar, which is appended to a buffer on the next step.
enum State {
    case data,
         begin_markup,
         slash1,
         name(Unicode.Scalar),
         attributes,
         no_attributes,
         label(Unicode.Scalar),
         space1,
         equals,
         space2,
         attribute_value,
         slash2,
         end_markup,

         exclam,
         hyphen1,
         comment,
         hyphen2,
         hyphen3,

         question1,
         pi_space,
         pi_data(Unicode.Scalar),
         question2
}

/// The kind of markup structure currently being read.
enum Markup {
    case none,
         start,
         empty,
         end,
         comment,
         processing
}

/// A zero-based line/column cursor.
struct Position {
    var line:Int = 0,
        column:Int = 0

    /// Moves past `u`: a line feed starts a new line, anything else moves one column right.
    @inline(__always)
    mutating func advance(_ u:Unicode.Scalar) {
        if u == "\n" {
            self.line += 1
            self.column = 0
        } else {
            self.column += 1
        }
    }
}

extension String.UnicodeScalarView.Iterator {
    /// Reads an entity or character reference. The leading `&` must already have
    /// been consumed by the caller.
    ///
    /// `position` is advanced for every scalar consumed here *except* the returned
    /// `after` scalar; the caller is responsible for advancing past that one.
    ///
    /// - Returns:
    ///   - `after`: on success, the scalar following the terminating `;` (`nil` at
    ///     end of input); on failure, the offending scalar (`nil` on EOF).
    ///   - `content`: on success, the replacement text — the referenced character,
    ///     the expansion of one of the five predefined entities, or, for an
    ///     unknown entity, `&` followed by its name; on failure, the raw text
    ///     consumed so far.
    ///   - `error`: `nil` on success, otherwise a description of the problem.
    mutating func read_reference(position:inout Position)
        -> (after:Unicode.Scalar?, content:[Unicode.Scalar], error:String?) {
        enum ReferenceState {
            case initial, name, hashtag, x, decimal(UInt32), hex(UInt32)
        }

        let default_entities:[String: [Unicode.Scalar]] = [
            "amp": ["&"],
            "lt": ["<"],
            "gt": [">"],
            "apos": ["'"],
            "quot": ["\""]
        ]
        // One past the largest Unicode scalar value. Accumulated code points are
        // clamped here so that arbitrarily long digit runs can neither trap on
        // UInt32 overflow (decimal) nor silently wrap around (hex); a clamped
        // value is then rejected as illegal when the reference is closed.
        let saturated:UInt32 = 0x110000

        var state:ReferenceState = .initial
        var content:[Unicode.Scalar] = ["&"]

        // Numeric value of a decimal digit, or nil.
        func decimal_digit(_ u:Unicode.Scalar) -> UInt32? {
            guard "0" ... "9" ~= u else {
                return nil
            }
            return u.value - ("0" as Unicode.Scalar).value
        }

        // Numeric value of a hexadecimal digit (either case), or nil.
        func hex_digit(_ u:Unicode.Scalar) -> UInt32? {
            switch u {
            case "0" ... "9":
                return u.value - ("0" as Unicode.Scalar).value
            case "a" ... "f":
                return 10 + u.value - ("a" as Unicode.Scalar).value
            case "A" ... "F":
                return 10 + u.value - ("A" as Unicode.Scalar).value
            default:
                return nil
            }
        }

        // Appends one digit to an accumulated code point, clamping at `saturated`.
        // `scalar` never exceeds `saturated`, so the arithmetic cannot overflow.
        func appending(_ digit:UInt32, to scalar:UInt32, radix:UInt32) -> UInt32 {
            return Swift.min(scalar * radix + digit, saturated)
        }

        // Validates a finished character reference against the XML `Char`
        // production and, if legal, consumes the terminating `;`.
        func finish_charref(_ terminator:Unicode.Scalar, scalar:UInt32)
            -> (after:Unicode.Scalar?, content:[Unicode.Scalar], error:String?) {
            guard scalar > 0 else {
                return (terminator, content, "cannot reference null character '\\0'")
            }
            guard scalar <= 0xD7FF || 0xE000 ... 0xFFFD ~= scalar || 0x10000 ... 0x10FFFF ~= scalar,
                  let referenced:Unicode.Scalar = Unicode.Scalar(scalar) else {
                return (terminator, content, "cannot reference illegal character '\\u{\(scalar)}'")
            }
            position.advance(terminator)
            return (self.next(), [referenced], nil)
        }

        while let u:Unicode.Scalar = self.next() {
            switch state {
            case .initial:
                if u == "#" {
                    state = .hashtag
                } else if u.is_xml_name_start {
                    state = .name
                } else {
                    return (u, content, "unescaped ampersand '&'")
                }

            case .name:
                if u == ";" {
                    // `content` is "&" + name at this point; unknown names are
                    // passed through in that form.
                    content = default_entities[String(content.dropFirst())] ?? content
                    position.advance(u)
                    return (self.next(), content, nil)
                }
                guard u.is_xml_name else {
                    return (u, content, "unexpected '\(u)' in entity reference")
                }

            case .hashtag:
                if let digit:UInt32 = decimal_digit(u) {
                    state = .decimal(digit)
                } else if u == "x" {
                    state = .x
                } else {
                    return (u, content, "unexpected '\(u)' in character reference")
                }

            case .decimal(let scalar):
                if let digit:UInt32 = decimal_digit(u) {
                    state = .decimal(appending(digit, to: scalar, radix: 10))
                } else if u == ";" {
                    return finish_charref(u, scalar: scalar)
                } else {
                    return (u, content, "unexpected '\(u)' in character reference")
                }

            case .x:
                guard let digit:UInt32 = hex_digit(u) else {
                    return (u, content, "unexpected '\(u)' in character reference")
                }
                state = .hex(digit)

            case .hex(let scalar):
                if let digit:UInt32 = hex_digit(u) {
                    state = .hex(appending(digit, to: scalar, radix: 16))
                } else if u == ";" {
                    return finish_charref(u, scalar: scalar)
                } else {
                    return (u, content, "unexpected '\(u)' in character reference")
                }
            }

            position.advance(u)
            content.append(u)
        }

        return (nil, content, "unexpected EOF inside reference")
    }
}

/// A pull-style XML tokenizer: each call to `readToken()` consumes just enough of
/// the input string to produce the next `XMLToken`.
public class XMLParser {
    /*
    private static func posix_path(_ path:String) -> String {
        guard let first_char:Character = path.first else { return path }
        var expanded_path:String = path
        if first_char == "~" {
            if expanded_path.count == 1 || expanded_path[expanded_path.index(after: expanded_path.startIndex)] == "/" {
                expanded_path = String(cString: getenv("HOME")) + String(expanded_path.dropFirst())
            }
        }
        return expanded_path
    }

    private mutating func open_text_file(_ posix_path:String) -> String? {
        guard let f:UnsafeMutablePointer = fopen(posix_path, "rb") else {
            self.handle_error("could not open file stream '\(posix_path)'", line: 0, column: 0)
            return nil
        }
        defer { fclose(f) }
        let fseek_status:CInt = fseek(f, 0, SEEK_END)
        guard fseek_status == 0 else {
            self.handle_error("fseek() on file '\(posix_path)' failed with error code \(fseek_status)", line: 0, column: 0)
            return nil
        }
        let n:CLong = ftell(f)
        guard 0 ..< CLong.max ~= n else {
            self.handle_error("ftell() on file '\(posix_path)' returned too large file size (\(n) bytes)", line: 0, column: 0)
            return nil
        }
        rewind(f)
        let buffer:UnsafeMutablePointer = UnsafeMutablePointer.allocate(capacity: n + 1) // leave room for sentinel
        defer { buffer.deallocate(capacity: n + 1) }
        let n_read = fread(buffer, MemoryLayout.size, n, f)
        guard n_read == n else {
            self.handle_error("fread() on file '\(posix_path)' read \(n_read) characters out of \(n)", line: 0, column: n_read)
            return nil
        }
        buffer[n] = 0 // cap with sentinel
        return String(cString: buffer)
    }
    */

    /// Creates a tokenizer over `str`. The initializer is declared failable but
    /// has no failing path.
    init?(str:String) {
        iterator = str.unicodeScalars.makeIterator()
        position = Position()
    }

    var iterator:String.UnicodeScalarView.Iterator
    var state:State = .end_markup,
        markup_context:Markup = .none
    /// Name of the tag / processing-instruction target being read.
    var name_buffer:[Unicode.Scalar] = [],
        /// Attribute name being read, or processing-instruction content.
        label_buffer:[Unicode.Scalar] = [],
        attributes:[String: String] = [:],
        /// The quote character that opened the attribute value being read.
        string_delimiter:Unicode.Scalar = "\0"
    /// Position just past the most recently pulled scalar.
    var position:Position

    /// Builds the token for the markup structure that has just been closed.
    /// Returns `nil` for structures that produce no token (comments).
    @inline(__always)
    func _emit_tag() -> XMLToken? {
        switch markup_context {
        case .none:
            return nil
        case .start:
            return .tag_start(String(name_buffer), attributes)
        case .empty:
            return .tag_empty(String(name_buffer), attributes)
        case .end:
            return .tag_end(String(name_buffer))
        case .comment:
            return nil
        case .processing:
            // `label_buffer` is not reset on entry to the next markup, so it
            // has to be cleared here.
            let copy = label_buffer
            label_buffer = []
            return .instruction(String(name_buffer), copy)
        }
    }

    /// A scalar that was pulled but must be re-dispatched under a new state.
    var reuseLastCharacter:Unicode.Scalar? = nil

    /// Builds an `XmlError` at the current position.
    private func error_here(_ message:String, column_offset:Int = 0) -> XmlError {
        return XmlError(message, position.line, position.column + column_offset)
    }

    /// Stores the attribute named by `label_buffer` with `value` and clears the
    /// buffer. Throws if the attribute is already defined on the current tag.
    private func commit_attribute(value:String) throws {
        let label_str:String = String(label_buffer)
        guard attributes[label_str] == nil else {
            throw error_here("redefinition of attribute '\(label_str)'")
        }
        attributes[label_str] = value
        label_buffer = []
    }

    /// Reads text starting at the already-pulled scalar `first` until
    /// `terminator` has been consumed, expanding references on the way.
    ///
    /// Every scalar pulled here is advanced exactly once, at the moment it is
    /// pulled, so that `position` stays accurate across newlines.
    ///
    /// - Returns: the expanded text and whether the terminator was found
    ///   (`false` means the input ended first).
    private func read_text(first:Unicode.Scalar, terminator:Unicode.Scalar) throws
        -> (text:[Unicode.Scalar], terminated:Bool) {
        var text:[Unicode.Scalar] = []
        var current:Unicode.Scalar = first
        while current != terminator {
            let next:Unicode.Scalar?
            if current == "&" {
                let reference = iterator.read_reference(position: &position)
                if let error_message:String = reference.error {
                    throw error_here(error_message)
                }
                text.append(contentsOf: reference.content)
                next = reference.after
            } else {
                text.append(current)
                next = iterator.next()
            }

            guard let following:Unicode.Scalar = next else {
                return (text, false)
            }
            position.advance(following)
            current = following
        }
        return (text, true)
    }

    /// Reads and returns the next token.
    ///
    /// Comments are consumed silently; they never produce a token and never end
    /// the stream.
    ///
    /// - Returns: the next token, or `nil` once the input is exhausted.
    /// - Throws: `XmlError` on malformed input, including input that ends inside
    ///   a markup structure.
    public func readToken() throws -> XMLToken? {
        while true {
            let u:Unicode.Scalar
            if let pending:Unicode.Scalar = reuseLastCharacter {
                u = pending
                reuseLastCharacter = nil
            } else if let pulled:Unicode.Scalar = iterator.next() {
                u = pulled
                position.advance(u)
            } else {
                switch state {
                case .end_markup:
                    return nil
                case .data:
                    state = .end_markup
                    return nil
                default:
                    throw error_here("unexpected end of stream inside markup structure")
                }
            }

            switch state {
            case .end_markup:
                markup_context = .none
                name_buffer = []
                attributes = [:]
                if u == "<" {
                    state = .begin_markup
                } else {
                    // Re-dispatch the same scalar as the first scalar of a text node.
                    state = .data
                    reuseLastCharacter = u
                    continue
                }

            case .data:
                let (data_buffer, terminated) = try read_text(first: u, terminator: "<")
                guard terminated else {
                    // Input ended inside character data; `state` stays `.data`
                    // so the next call reports end of stream.
                    return XMLToken.data(data_buffer)
                }
                state = .begin_markup
                if !data_buffer.isEmpty {
                    return XMLToken.data(data_buffer)
                }

            case .begin_markup:
                markup_context = .start
                if u.is_xml_name_start {
                    state = .name(u)
                } else if u == "/" {
                    state = .slash1
                } else if u == "!" {
                    state = .exclam
                } else if u == "?" {
                    state = .question1
                } else {
                    throw error_here("unexpected '\(u)' after left angle bracket '<'")
                }

            case .slash1:
                markup_context = .end
                guard u.is_xml_name_start else {
                    throw error_here("unexpected '\(u)' in end tag '\(String(name_buffer))'")
                }
                state = .name(u)

            case .name(let u_previous):
                name_buffer.append(u_previous)
                if u.is_xml_name {
                    state = .name(u)
                } else {
                    switch markup_context {
                    case .start:
                        if u.is_xml_whitespace {
                            state = .attributes
                        } else if u == "/" {
                            state = .slash2
                        } else if u == ">" {
                            state = .end_markup
                        } else {
                            throw error_here("unexpected '\(u)' in start tag '\(String(name_buffer))'")
                        }
                    case .end:
                        if u.is_xml_whitespace {
                            state = .no_attributes
                        } else if u == ">" {
                            state = .end_markup
                        } else {
                            throw error_here("unexpected '\(u)' in end tag '\(String(name_buffer))'")
                        }
                    case .processing:
                        if u.is_xml_whitespace {
                            state = .pi_space
                        } else if u == "?" {
                            state = .question2
                        } else {
                            throw error_here("unexpected '\(u)' in processing instruction '\(String(name_buffer))'")
                        }
                    case .none, .empty, .comment:
                        break
                    }
                }

            case .attributes:
                if u.is_xml_name_start {
                    state = .label(u)
                } else if u == "/" {
                    state = .slash2
                } else if u == ">" {
                    state = .end_markup
                } else {
                    guard u.is_xml_whitespace else {
                        throw error_here("unexpected '\(u)' in start tag '\(String(name_buffer))'")
                    }
                }

            case .no_attributes:
                if u == ">" {
                    state = .end_markup
                } else {
                    guard u.is_xml_whitespace else {
                        if u.is_xml_name_start {
                            throw error_here("end tag '\(String(name_buffer))' cannot contain attributes")
                        } else {
                            throw error_here("unexpected '\(u)' in end tag '\(String(name_buffer))'")
                        }
                    }
                }

            case .label(let u_previous):
                label_buffer.append(u_previous)
                if u.is_xml_name {
                    state = .label(u)
                } else if u == "=" {
                    state = .equals
                } else if u == "/" {
                    // Attribute without a value: recorded with an empty string.
                    try commit_attribute(value: "")
                    state = .slash2
                } else if u == ">" {
                    try commit_attribute(value: "")
                    state = .end_markup
                } else {
                    guard u.is_xml_whitespace else {
                        throw error_here("unexpected '\(u)' in start tag '\(String(name_buffer))'")
                    }
                    state = .space1
                }

            case .space1:
                // A label has been read and is followed by whitespace. Anything
                // other than '=' means the label was a value-less attribute and
                // must be committed, otherwise it would be dropped and its name
                // would leak into the next attribute's label.
                if u == "=" {
                    state = .equals
                } else if u == "/" {
                    try commit_attribute(value: "")
                    state = .slash2
                } else if u == ">" {
                    try commit_attribute(value: "")
                    state = .end_markup
                } else if !u.is_xml_whitespace {
                    try commit_attribute(value: "")
                    reuseLastCharacter = u
                    state = .attributes
                    continue
                }

            case .equals:
                if u == "\"" || u == "'" {
                    string_delimiter = u
                    state = .attribute_value
                } else {
                    guard u.is_xml_whitespace else {
                        throw error_here("unexpected '\(u)' in start tag '\(String(name_buffer))'")
                    }
                    state = .space2
                }

            case .space2:
                if u == "\"" || u == "'" {
                    string_delimiter = u
                    state = .attribute_value
                } else {
                    guard u.is_xml_whitespace else {
                        throw error_here("unexpected '\(u)' in start tag '\(String(name_buffer))'")
                    }
                }

            case .attribute_value:
                let (value_buffer, terminated) = try read_text(first: u, terminator: string_delimiter)
                guard terminated else {
                    throw error_here("unexpected end of stream inside of attribute")
                }
                string_delimiter = "\0"
                try commit_attribute(value: String(value_buffer))
                state = .attributes

            case .slash2:
                markup_context = .empty
                guard u == ">" else {
                    throw error_here("unexpected '\(u)' in empty tag '\(String(name_buffer))'")
                }
                state = .end_markup

            case .exclam:
                if u == "-" {
                    state = .hyphen1
                } else {
                    state = .name(u)
                }

            // NOTE(review): in the reviewed copy of this file the text between
            // the `hyphen1` error message and the `hyphen3` guard had been lost.
            // The `hyphen1` tail, `comment`, `hyphen2` and the `hyphen3` head
            // below are reconstructed from the `State` / `Markup` enums and the
            // XML comment grammar — confirm against the original.
            case .hyphen1:
                guard u == "-" else {
                    throw error_here("unexpected '\(u)' after '<!-'")
                }
                markup_context = .comment
                state = .comment

            case .comment:
                if u == "-" {
                    state = .hyphen2
                }

            case .hyphen2:
                state = u == "-" ? .hyphen3 : .comment

            case .hyphen3:
                guard u == ">" else {
                    throw error_here("unexpected double hyphen '--' inside comment body", column_offset: -1)
                }
                state = .end_markup

            // NOTE(review): likewise the `question1` tail, `pi_space`, `pi_data`
            // and the `question2` head are reconstructed — confirm against the
            // original.
            case .question1:
                markup_context = .processing
                guard u.is_xml_name_start else {
                    throw error_here("unexpected '\(u)' after '<?'")
                }
                state = .name(u)

            case .pi_space:
                if u == "?" {
                    state = .question2
                } else if !u.is_xml_whitespace {
                    state = .pi_data(u)
                }

            case .pi_data(let u_previous):
                label_buffer.append(u_previous)
                if u == "?" {
                    state = .question2
                } else {
                    state = .pi_data(u)
                }

            case .question2:
                if u == ">" {
                    state = .end_markup
                } else {
                    // The '?' was content after all.
                    label_buffer.append("?")
                    state = .pi_data(u)
                }
            }

            // A markup structure has just been closed. Structures that yield no
            // token (comments) must not be reported as end of stream, so keep
            // reading in that case.
            if case .end_markup = state, let token:XMLToken = _emit_tag() {
                return token
            }
        }
    }
}