ProteoWizard
SAXParser.hpp
Go to the documentation of this file.
1//
2// $Id$
3//
4//
5// Original author: Darren Kessner <darren@proteowizard.org>
6//
7// Copyright 2007 Spielberg Family Center for Applied Proteomics
8// Cedars-Sinai Medical Center, Los Angeles, California 90048
9//
10// Reworked for zero-copy performance by Brian Pratt, Insilicos LLC
11// those changes Copyright 2011 Insilicos LLC All Rights Reserved
12//
13// Licensed under the Apache License, Version 2.0 (the "License");
14// you may not use this file except in compliance with the License.
15// You may obtain a copy of the License at
16//
17// http://www.apache.org/licenses/LICENSE-2.0
18//
19// Unless required by applicable law or agreed to in writing, software
20// distributed under the License is distributed on an "AS IS" BASIS,
21// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22// See the License for the specific language governing permissions and
23// limitations under the License.
24//
25
26
27#ifndef _SAXPARSER_HPP_
28#define _SAXPARSER_HPP_
29
33#include "boost/iostreams/positioning.hpp"
34#include <string.h>
35#include <iosfwd>
36#include <string>
37#include <vector>
38#include <assert.h>
39#include <stdexcept>
40
41
42namespace pwiz {
43namespace minimxml {
44
45
46///
47/// An extended SAX interface for custom XML stream parsing.
48///
49/// Use cases:
50/// - read a single element
51/// - read a single element, aborting on a specified tag
52/// - delegate handling of a sub-element to another handler
53///
54namespace SAXParser {
55
56PWIZ_API_DECL size_t count_trail_ws(const char *data,size_t len); // count whitespace chars at end of data
57PWIZ_API_DECL void unescapeXML(char *str);
58PWIZ_API_DECL void unescapeXML(std::string &str);
59
61{
62 // simple string management for zero-copy saxparser
63 //
64 // not using std::string due to overhead with:
65 // reference counts
66 // exception unwinding
67 // etc etc
68 //
69 // provides for zero-copy trimming of whitespace
70 //
71public:
72 saxstring(size_t size = 0) {
73 init(size);
74 }
75
77 *this = rhs;
78 }
79
80 saxstring(const std::string &rhs) {
81 init(rhs.length());
82 memcpy(data(),rhs.c_str(),rhs.length());
83 (*this)[rhs.length()] = 0; // nullterm
84 }
85
86 void unescapeXML() {
87 if (strchr(c_str(),'&')) {
89 resize(strlen(c_str()));
90 }
91 }
92
94 free(_data);
95 }
96
98 init(rhs.length());
99 if (length()) {
100 memcpy(data(),rhs.c_str(),length()+1);
101 }
102 return *this;
103 }
104
105 saxstring & operator = (const char *rhs) {
106 init(rhs ? strlen(rhs) : 0);
107 if (length()) {
108 memcpy(data(),rhs,length()+1);
109 }
110 return *this;
111 }
112
114 if (rhs.length()) {
115 size_t oldsize = length();
116 resize(rhs.length()+oldsize);
117 memcpy(data()+oldsize,rhs.c_str(),rhs.length()+1);
118 }
119 return *this;
120 }
121
122 saxstring & operator += (const char *rhs) {
123 size_t rhslen = rhs?strlen(rhs):0;
124 if (rhslen) {
125 size_t oldsize = length();
126 resize(rhslen+oldsize);
127 strcpy(data()+oldsize,rhs);
128 }
129 return *this;
130 }
131
132 bool operator == (const char *c) const {
133 return c && !strcmp(c,c_str());
134 }
135
136 bool operator == (const std::string &s) const {
137 return !strcmp(c_str(),s.c_str());
138 }
139
140 bool operator == (const saxstring &s) const {
141 return !strcmp(c_str(),s.c_str());
142 }
143
144 char *resize(size_t size) {
145 if (!size) {
146 _lead = 0; // empty, reclaim the start of buffer
147 }
148 size_t new_used = size + _lead; // translate to "used" space
149 if (new_used >= _capacity) {
150 _data = (char *)realloc(_data, (_used = new_used)+1);
151 if (_used && !_data) {
152 throw std::runtime_error("SAXParser: cannot allocate memory");
153 }
155 } else {
156 _used = new_used;
157 }
158 _data[_used] = 0;
159 return _data;
160 }
161 void clear() {
162 resize(0);
163 }
164 inline const char *c_str() const {
165 return _data?_data+_lead:"";
166 }
167 inline char & operator [](size_t n) {
168 return *(data()+n);
169 }
170 inline size_t length() const {
171 return _used-_lead;
172 }
173 inline size_t capacity() const {
174 return _capacity;
175 }
176 void trim_trail_ws() { // remove trailing whitespace if any
177 size_t n = count_trail_ws(c_str(),length());
178 resize(length()-n);
179 }
180 // returns number of ws chars it had to eat on front end
182 size_t n=0;
183 for (const char *c=c_str(); *c && strchr(" \n\r\t",*c); c++) {
184 n++;
185 }
186 _lead += n;
187 return n;
188 }
189 bool starts_with(const char *txt) const {
190 return !strncmp(c_str(),txt,strlen(txt));
191 }
192 bool ends_with(const char *txt) const {
193 size_t len = strlen(txt);
194 return (len <= length()) ? (!strcmp(c_str()+length()-len,txt)) : false;
195 }
196 char *data() { // direct access to data buffer
197 if (!_data) {
198 resize(0);
199 }
200 return _data+_lead;
201 }
202private:
203 void init(size_t size) {
204 _used = 0;
205 _lead = 0;
206 _capacity = 0;
207 _data = NULL;
208 if (size) {
209 resize(size);
210 }
211 }
212 char * _data; // char buf
213 size_t _used; // characters used
214 size_t _lead; // for skipping whitespace
215 size_t _capacity; // max characters (always >_used)
216};
217
218inline std::ostream& operator<<(std::ostream& os, const saxstring& s)
219{
220 os << s.c_str();
221 return os;
222}
223
224// fast string-to-value conversions
225// not very boost-y, or even very c++, but lexical_cast and istringstreams are
226// just too slow for our parsing performance needs.
227template< typename Target > inline Target textToValue(const char *txt); // template prototype
228
229template<> inline float textToValue(const char *txt)
230{
231 return (float) ATOF( txt ) ;
232}
233
234template<> inline double textToValue(const char *txt)
235{
236 return ATOF( txt );
237}
238
239template<> inline int textToValue(const char *txt)
240{
241 return atoi(txt);
242}
243
244template<> inline char textToValue(const char *txt)
245{
246 return *(txt);
247}
248
249template<> inline long textToValue(const char *txt)
250{
251 return atol(txt);
252}
253
254template<> inline unsigned int textToValue(const char *txt)
255{
256 return (unsigned int) strtoul( txt, NULL, 10 );
257}
258
259template<> inline unsigned long textToValue(const char *txt)
260{
261 return strtoul( txt, NULL, 10 );
262}
263
264#if defined(BOOST_HAS_LONG_LONG)
265
266template<> inline long long textToValue(const char *txt)
267{
268#if defined(BOOST_HAS_MS_INT64)
269 return _atoi64(txt);
270#else
271 return atoll(txt);
272#endif
273}
274
275template<> inline unsigned long long textToValue(const char *txt)
276{
277#if defined(BOOST_HAS_MS_INT64)
278 return _strtoui64(txt,NULL,10);
279#else
280 return strtoull( txt, NULL, 10 );
281#endif
282}
283
284#endif // has long long
285
/// Boolean reading of a text value: everything is true except the exact
/// strings "0" and "false" (as in optimized_lexical_cast.h).
inline bool istrue(const char *t)
{
    if (strcmp(t, "0") == 0) {
        return false;
    }
    return strcmp(t, "false") != 0;
}
290
291template<> inline bool textToValue(const char *txt)
292{
293 return istrue(txt);
294}
295
296template<> inline boost::logic::tribool textToValue(const char *txt)
297{
298 using namespace boost::logic;
299 if (!*txt)
300 return tribool(indeterminate);
301 else
302 {
303 bool b = istrue(txt);
304 return tribool(b);
305 }
306}
307
308template<> inline std::string textToValue(const char *txt)
309{
310 return std::string( txt );
311}
312
313
314/// SAX event handler interface.
316{
317 public:
318
319 /// When false, no calls to characters() will be made
321
322 /// Setting these to false will disable the auto-unescaping feature of the parser;
323 /// this is useful for handlers which deal with large amounts of data
325
326 /// contextual version available to control handler logic which support multiple versions of a schema;
327 /// the default value 0 indicates handler should ignore the version;
328 /// the handler determines the meaning of any non-zero value
330
331 /// Handler returns the Status struct as a means of changing the parser's behavior.
332 struct Status
333 {
334 enum Flag
335 {
336 Ok, // ok, continue parsing the stream
337 Done, // abort immediately
338 Delegate // delegate this element to the specified Handler [startElement() only]
339 };
340
342 Handler* delegate; // valid iff (flag == Delegate)
343
344 Status(Flag _flag = Ok,
345 Handler* _delegate = 0)
346 : flag(_flag), delegate(_delegate)
347 {}
348 };
349
352 {
353 // lazy evaluation - doesn't process text until asked
354 // near-zero copy - copies the source text just once,
355 // instead of a bunch of little std::string operations
356 public:
357 Attributes(const char * _source_text, size_t _source_text_len, bool _autoUnescape) :
358 index(0),index_end(0),autoUnescape(_autoUnescape),firstread(true),attrs()
359 {
360 size=_source_text_len;
361 textbuff = (char *)malloc(size+1);
362 managemem = true;
363 memcpy(textbuff,_source_text,size);
364 textbuff[size] = 0;
365 setParserIndex(); // ready for eventual parsing
366 test_invariant(); // everything correct?
367 };
369 index(0),index_end(0),autoUnescape(false),firstread(true),attrs()
370 {
371 size=0;
372 textbuff = NULL;
373 managemem = true;
374 test_invariant(); // everything correct?
375 };
376 Attributes(saxstring &str, bool _autoUnescape) :
377 index(0),index_end(0),autoUnescape(_autoUnescape),firstread(true),attrs()
378 {
379 textbuff = str.data();
380 size=str.length();
381 managemem = false; // we don't have to free this
382 setParserIndex(); // ready for eventual parsing
383 test_invariant(); // everything correct?
384 };
386 {
387 if (managemem)
388 free(textbuff);
389 }
391 {
392 textbuff = NULL;
393 *this = rhs;
394 }
396 size = rhs.size;
397 index = rhs.index;
398 index_end = rhs.index_end; // string bounds for attribute parsing
399 autoUnescape = rhs.autoUnescape; // do XML escape of attribute?
400 firstread = rhs.firstread; // may change during const access
401 if (managemem)
402 textbuff = (char *)realloc(textbuff,size+1);
403 else
404 textbuff = (char *)malloc(size+1);
405 managemem = true; // we need to free textbuff at dtor
406 memcpy(textbuff,rhs.textbuff,size+1);
407 attrs.resize(rhs.attrs.size());
408 // now fix up the char ptrs to point to our copy of attribute list
409 for (size_t n=attrs.size();n--;)
410 {
411 attrs[n].name = ((char *)textbuff)+(rhs.attrs[n].getName()-rhs.getTextBuffer());
412 attrs[n].value = ((char *)textbuff)+(rhs.attrs[n].getValuePtr()-rhs.getTextBuffer());
413 }
414 test_invariant(); // everything correct?
415 return *this;
416 }
417
418 inline void test_invariant() const
419 {
420#ifdef _DEBUG
421 for (size_t n=attrs.size();n--;)
422 {
423 assert(textbuff != NULL);
424 assert(attrs[n].name>textbuff);
425 assert(attrs[n].value>attrs[n].name);
426 assert(attrs[n].value<textbuff+size);
427 if (n)
428 assert(attrs[n].name>attrs[n-1].value);
429 }
430#endif
431 }
432
433 const char *getTagName() const
434 { // work area contains tag name
435 test_invariant(); // everything correct?
436 return textbuff+('/'==*textbuff);
437 }
438 const char *getTextBuffer() const
439 { // return pointer to our work area
440 test_invariant(); // everything correct?
441 return textbuff;
442 }
443 size_t getSize() const
444 {
445 return size;
446 }
447 protected:
448 mutable char *textbuff; // we'll operate on this copy of string
449 size_t size;
450 mutable size_t index,index_end; // string bounds for attribute parsing
451 bool autoUnescape; // do XML escape of attribute?
452 bool managemem; // if true we need to free on exit
453 mutable bool firstread; // may change during const access
454
456 {
457 // on entry, buffer has form "foo bar="baz" or maybe "foo/"
458 const char *c = textbuff;
459 while (*c && !strchr(" \n\r\t/",*c)) c++;
460 size_t indexNameEnd = c-textbuff;
461 while (*c && strchr(" \n\r\t",*c)) c++;
462 textbuff[indexNameEnd] = 0; // nullterm the name
463 index = c-textbuff; // should point to bar
464 index_end = size;
465 test_invariant(); // everything correct?
466 }
467 public:
469 {
470 // a set of pointers into the main text buffer - going for zero copy, for speed
471 public:
473 bool matchName(const char *test) const
474 {
475 return !strcmp(test,name); // return true on match
476 }
477 const char *getName() const
478 {
479 return name;
480 }
481
482 // handle XML escapes on demand
484 {
485 if (Unescape == NoXMLUnescape)
486 needsUnescape = false;
487 else if (needsUnescape) {
489 needsUnescape = false;
490 }
491 return value;
492 }
493 std::string getValue(XMLUnescapeBehavior_t Unescape = XMLUnescapeDefault) const {
494 return std::string(getValuePtr(Unescape));
495 }
496
497 // cast-to-type
498 template< typename T >
499 inline T valueAs( XMLUnescapeBehavior_t Unescape ) const
500 {
501 return textToValue<T>(getValuePtr(Unescape));
502 }
503
504 inline size_t valueAs( XMLUnescapeBehavior_t Unescape ) const
505 {
506 return (size_t)strtoul(getValuePtr(Unescape),NULL,10);
507 }
508
509 friend class Attributes;
510 protected:
511 const char *name; // attribute name - a pointer into main text buffer
512 char *value; // also a pointer into main text buffer, content may change during read
513 mutable bool needsUnescape; // may change during read
514 void set(const char *_name, char *_value, bool _needsUnescape)
515 {
516 name = _name;
517 value = _value;
518 needsUnescape = _needsUnescape;
519 }
520 }; // class attribute
521
522 public:
523 typedef std::vector<attribute> attribute_list;
524 protected:
525 mutable attribute_list attrs; // may change even in a const function due to lazy evaluation
526 public:
527 attribute_list::const_iterator begin() const
528 {
529 access(); // have we actually parsed the attributes text yet?
530 return attrs.begin();
531 }
532 attribute_list::const_iterator end() const
533 {
534 access(); // have we actually parsed the attributes text yet?
535 return attrs.end();
536 }
537 attribute_list::const_iterator find(const std::string &name) const
538 {
539 attribute_list::const_iterator it;
540 for (it = begin(); it != end() ; it++ )
541 {
542 if (it->matchName(name.c_str()))
543 break; // found it
544 }
545 return it;
546 }
547 protected:
548
549 PWIZ_API_DECL void parseAttributes(std::string::size_type& index) const;
550
551 void access() const
552 { // don't parse attributes until asked to
553 test_invariant(); // everything correct?
554 if (firstread) {
555 firstread = false;
557 }
558 test_invariant(); // everything correct?
559 }
560
561 public:
562 const attribute *findAttributeByName(const char *name) const
563 {
564 access(); // parse the buffer if we haven't already
565 for (attribute_list::const_iterator it=attrs.begin();it!=attrs.end();it++)
566 {
567 if (it->matchName(name))
568 return &(*it);
569 }
570 return NULL;
571 }
572
573 // return value for name if any, or NULL
574 const char *findValueByName(const char *name,XMLUnescapeBehavior_t Unescape = XMLUnescapeDefault) const
575 {
576 const attribute *attr = findAttributeByName(name);
577 if (attr)
578 return attr->getValuePtr(Unescape);
579 return NULL;
580 }
581
582 };
583 typedef boost::iostreams::stream_offset stream_offset;
584
585 virtual Status processingInstruction(const std::string& name,
586 const std::string& data,
587 stream_offset position) {return Status::Ok;}
588
589 virtual Status startElement(const std::string& name,
590 const Attributes& attributes,
591 stream_offset position) {return Status::Ok;}
592
593 virtual Status endElement(const std::string& name,
594 stream_offset position) {return Status::Ok;}
595
597 stream_offset position) {return Status::Ok;}
598
600 virtual ~Handler(){}
601
602 protected:
603
604 template <typename T>
605 inline T& getAttribute(const Attributes& attributes,
606 const char * name,
607 T& result,
608 XMLUnescapeBehavior_t Unescape,
609 T defaultValue = T()) const
610 {
611 const Attributes::attribute *attr = attributes.findAttributeByName(name);
612 if (attr)
613 result = attr->valueAs<T>(Unescape);
614 else
615 result = defaultValue;
616 return result;
617 }
618
619 const char *getAttribute(const Attributes& attributes,
620 const char * name,
621 XMLUnescapeBehavior_t Unescape,
622 const char * defaultValue = NULL) const
623 {
624 const char *val = attributes.findValueByName(name,Unescape);
625 if (!val)
626 val = defaultValue;
627 return val;
628 }
629
630
631 // general case using default unescape behavior
632 template <typename T>
633 inline T& getAttribute(const Attributes& attributes,
634 const char *name,
635 T& result) const
636 {
637 const Attributes::attribute *attr = attributes.findAttributeByName(name);
638 if (attr)
639 result = attr->valueAs<T>(XMLUnescapeDefault);
640 else
641 result = T();
642 return result;
643 }
644
645 inline std::string& getAttribute(const Attributes& attributes,
646 const char *name,
647 std::string& result) const
648 {
649 const Attributes::attribute *attr = attributes.findAttributeByName(name);
650 if (attr)
651 result = attr->getValuePtr(XMLUnescapeDefault);
652 else
653 result = "";
654 return result;
655 }
656
657 // general case using default unescape behavior
658 template <typename T>
659 inline T& getAttribute(const Attributes& attributes,
660 const std::string &name,
661 T& result,
662 T defaultValue = T()) const
663 {
664 const Attributes::attribute *attr = attributes.findAttributeByName(name.c_str());
665 if (attr)
666 result = attr->valueAs<T>(XMLUnescapeDefault);
667 else
668 result = defaultValue;
669 return result;
670 }
671};
672
673
674///
675/// Extract a single XML element from the istream, sending SAX events to the handler.
676///
677/// Behavior:
678///
679/// - Parser returns when it completes reading of the first element it encounters.
680///
681/// - Parser returns immediately if the Handler returns Status::Done when handling an event.
682///
683/// - On startElement(), Handler may delegate handling to a sub-Handler, which will receive
684/// the same startElement() event. The sub-Handler pointer will remain on the parser's
685/// Handler stack until it handles the corresponding endElement(). Caution: The sub-Handler
686/// pointer must remain valid while it is on the Handler stack, so it cannot point to
687/// a local object that goes out of scope when Handler:startElement() returns.
688///
689/// Notes:
690/// - Start tags with end marker '/' generate two events, e.g. <br/> will generate events
691/// startElement("br", ...) and endElement("br").
692///
693PWIZ_API_DECL void parse(std::istream& is, Handler& handler);
694
695
696} // namespace SAXParser
697
698
699/// Returns the root element from an XML buffer;
700/// throws runtime_error if no element is found.
701PWIZ_API_DECL std::string xml_root_element(const std::string& fileheader);
702
703/// Returns the root element from an XML stream;
704/// throws runtime_error if no element is found.
705PWIZ_API_DECL std::string xml_root_element(std::istream& is);
706
707/// Returns the root element from an XML file;
708/// throws runtime_error if no element is found.
709PWIZ_API_DECL std::string xml_root_element_from_file(const std::string& filepath);
710
711
712/// Decodes any characters encoded with their hexadecimal value,
713/// e.g. "_x0020_" decodes as " "
714/// This override modifies the input string in place and returns its reference.
715PWIZ_API_DECL std::string& decode_xml_id(std::string& str);
716
717
718/// Decodes any characters encoded with their hexadecimal value,
719/// e.g. "_x0020_" decodes as " "
720/// This override modifies and returns a copy of the input string.
721PWIZ_API_DECL std::string decode_xml_id_copy(const std::string& str);
722
723
724} // namespace minimxml
725} // namespace pwiz
726
727
728#endif // _SAXPARSER_HPP_
729
730
T defaultValue()
#define PWIZ_API_DECL
Definition Export.hpp:32
void set(const char *_name, char *_value, bool _needsUnescape)
std::string getValue(XMLUnescapeBehavior_t Unescape=XMLUnescapeDefault) const
T valueAs(XMLUnescapeBehavior_t Unescape) const
const char * getValuePtr(XMLUnescapeBehavior_t Unescape=XMLUnescapeDefault) const
size_t valueAs(XMLUnescapeBehavior_t Unescape) const
Attributes(const char *_source_text, size_t _source_text_len, bool _autoUnescape)
attribute_list::const_iterator begin() const
attribute_list::const_iterator end() const
attribute_list::const_iterator find(const std::string &name) const
Attributes & operator=(const Attributes &rhs)
Attributes(saxstring &str, bool _autoUnescape)
const char * findValueByName(const char *name, XMLUnescapeBehavior_t Unescape=XMLUnescapeDefault) const
const attribute * findAttributeByName(const char *name) const
PWIZ_API_DECL void parseAttributes(std::string::size_type &index) const
SAX event handler interface.
bool parseCharacters
When false, no calls to characters() will be made.
T & getAttribute(const Attributes &attributes, const char *name, T &result) const
virtual Status processingInstruction(const std::string &name, const std::string &data, stream_offset position)
virtual Status characters(const SAXParser::saxstring &text, stream_offset position)
bool autoUnescapeAttributes
Setting these to false will disable the auto-unescaping feature of the parser; this is useful for han...
std::string & getAttribute(const Attributes &attributes, const char *name, std::string &result) const
T & getAttribute(const Attributes &attributes, const char *name, T &result, XMLUnescapeBehavior_t Unescape, T defaultValue=T()) const
virtual Status endElement(const std::string &name, stream_offset position)
T & getAttribute(const Attributes &attributes, const std::string &name, T &result, T defaultValue=T()) const
virtual Status startElement(const std::string &name, const Attributes &attributes, stream_offset position)
const char * getAttribute(const Attributes &attributes, const char *name, XMLUnescapeBehavior_t Unescape, const char *defaultValue=NULL) const
int version
contextual version available to control handler logic which support multiple versions of a schema; th...
boost::iostreams::stream_offset stream_offset
saxstring & operator=(const SAXParser::saxstring &rhs)
Definition SAXParser.hpp:97
bool ends_with(const char *txt) const
saxstring(const std::string &rhs)
Definition SAXParser.hpp:80
saxstring & operator+=(const SAXParser::saxstring &rhs)
saxstring(const SAXParser::saxstring &rhs)
Definition SAXParser.hpp:76
bool starts_with(const char *txt) const
bool operator==(const char *c) const
PWIZ_API_DECL void parse(std::istream &is, Handler &handler)
Extract a single XML element from the istream, sending SAX events to the handler.
PWIZ_API_DECL size_t count_trail_ws(const char *data, size_t len)
std::ostream & operator<<(std::ostream &os, const saxstring &s)
Target textToValue(const char *txt)
PWIZ_API_DECL void unescapeXML(char *str)
bool istrue(const char *t)
PWIZ_API_DECL std::string decode_xml_id_copy(const std::string &str)
Decodes any characters encoded with their hexadecimal value, e.g.
PWIZ_API_DECL std::string xml_root_element(const std::string &fileheader)
Returns the root element from an XML buffer; throws runtime_error if no element is found.
PWIZ_API_DECL std::string xml_root_element_from_file(const std::string &filepath)
Returns the root element from an XML file; throws runtime_error if no element is found.
PWIZ_API_DECL std::string & decode_xml_id(std::string &str)
Decodes any characters encoded with their hexadecimal value, e.g.
#define ATOF(x)
Handler returns the Status struct as a means of changing the parser's behavior.
Status(Flag _flag=Ok, Handler *_delegate=0)