001    /*
002     * Copyright 2005 John G. Wilson
003     *
004     * Licensed under the Apache License, Version 2.0 (the "License");
005     * you may not use this file except in compliance with the License.
006     * You may obtain a copy of the License at
007     *
008     *     http://www.apache.org/licenses/LICENSE-2.0
009     *
010     * Unless required by applicable law or agreed to in writing, software
011     * distributed under the License is distributed on an "AS IS" BASIS,
012     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013     * See the License for the specific language governing permissions and
014     * limitations under the License.
015     *
016     */
017    
018    package groovy.util;
019    
020    import groovy.util.slurpersupport.GPathResult;
021    import groovy.util.slurpersupport.Node;
022    import groovy.util.slurpersupport.NodeChild;
023    
024    import java.io.File;
025    import java.io.FileInputStream;
026    import java.io.IOException;
027    import java.io.InputStream;
028    import java.io.Reader;
029    import java.io.StringReader;
030    import java.net.URL;
031    import java.security.AccessController;
032    import java.security.PrivilegedActionException;
033    import java.security.PrivilegedExceptionAction;
034    import java.util.HashMap;
035    import java.util.Hashtable;
036    import java.util.Map;
037    import java.util.Stack;
038    
039    import javax.xml.parsers.ParserConfigurationException;
040    import javax.xml.parsers.SAXParser;
041    import javax.xml.parsers.SAXParserFactory;
042    
043    import org.xml.sax.Attributes;
044    import org.xml.sax.DTDHandler;
045    import org.xml.sax.EntityResolver;
046    import org.xml.sax.ErrorHandler;
047    import org.xml.sax.InputSource;
048    import org.xml.sax.SAXException;
049    import org.xml.sax.SAXNotRecognizedException;
050    import org.xml.sax.SAXNotSupportedException;
051    import org.xml.sax.XMLReader;
052    import org.xml.sax.helpers.DefaultHandler;
053    
054    /**
055     * @author John Wilson
056     *
057     */
058    
059    public class XmlSlurper extends DefaultHandler {
060      private final XMLReader reader;
061      private Node currentNode = null;
062      private final Stack stack = new Stack();
063      private final StringBuffer charBuffer = new StringBuffer();
064      private final Map namespaceTagHints = new Hashtable();
065    
066      public XmlSlurper() throws ParserConfigurationException, SAXException {
067        this(false, true);
068      }
069      
070      public XmlSlurper(final boolean validating, final boolean namespaceAware) throws ParserConfigurationException, SAXException {
071      SAXParserFactory factory = null;
072        
073        try {
074          factory = (SAXParserFactory) AccessController.doPrivileged(new PrivilegedExceptionAction() {
075            public Object run() throws ParserConfigurationException {
076              return SAXParserFactory.newInstance();
077            }
078          });
079        } catch (final PrivilegedActionException pae) {
080          final Exception e = pae.getException();
081          
082          if (e instanceof ParserConfigurationException) {
083            throw (ParserConfigurationException) e;
084          } else {
085            throw new RuntimeException(e);
086          }
087        }
088        factory.setNamespaceAware(namespaceAware);
089        factory.setValidating(validating);
090        
091        final SAXParser parser = factory.newSAXParser();
092        this.reader = parser.getXMLReader();
093      }
094      
095      public XmlSlurper(final XMLReader reader) {
096        this.reader = reader;
097      }
098      
099      public XmlSlurper(final SAXParser parser) throws SAXException {
100        this(parser.getXMLReader());
101      }
102      
103      /**
104       * @return The GPathResult instance created by consuming a stream of SAX events
105       * Note if one of the parse methods has been called then this returns null
106       * Note if this is called more than once all calls after the first will return null
107       *
108       */
109      public GPathResult getDocument() {
110        try {
111          return new NodeChild(this.currentNode, null, this.namespaceTagHints);
112        } finally {
113          this.currentNode = null;
114        }
115      }
116      
117      /**
118       * Parse the content of the specified input source into a GPathResult object
119       * 
120       * @param input
121       * @return An object which supports GPath expressions
122       * @throws IOException
123       * @throws SAXException
124       */
125      public GPathResult parse(final InputSource input) throws IOException, SAXException {
126        this.reader.setContentHandler(this);
127        this.reader.parse(input);
128        
129        return getDocument();
130        
131      }
132      
133      /**
134       * Parses the content of the given file as XML turning it into a GPathResult object
135       * 
136       * @param file
137       * @return An object which supports GPath expressions
138       * @throws IOException
139       * @throws SAXException
140       */
141      public GPathResult parse(final File file) throws IOException, SAXException {
142      final InputSource input = new InputSource(new FileInputStream(file));
143        
144        input.setSystemId("file://" + file.getAbsolutePath());
145        
146        return parse(input);
147        
148      }
149      
150      /**
151       * Parse the content of the specified input stream into an GPathResult Object.
152       * Note that using this method will not provide the parser with any URI
153       * for which to find DTDs etc
154       * 
155       * @param input
156       * @return An object which supports GPath expressions
157       * @throws IOException
158       * @throws SAXException
159       */
160      public GPathResult parse(final InputStream input) throws IOException, SAXException {
161        return parse(new InputSource(input));
162      }
163      
164      /**
165       * Parse the content of the specified reader into a GPathResult Object.
166       * Note that using this method will not provide the parser with any URI
167       * for which to find DTDs etc
168       * 
169       * @param in
170       * @return An object which supports GPath expressions
171       * @throws IOException
172       * @throws SAXException
173       */
174      public GPathResult parse(final Reader in) throws IOException, SAXException {
175        return parse(new InputSource(in));
176      }
177      
178      /**
179       * Parse the content of the specified URI into a GPathResult Object
180       * 
181       * @param uri
182       * @return An object which supports GPath expressions
183       * @throws IOException
184       * @throws SAXException
185       */
186      public GPathResult parse(final String uri) throws IOException, SAXException {
187        return parse(new InputSource(uri));
188      }
189      
190      /**
191       * A helper method to parse the given text as XML
192       * 
193       * @param text
194       * @return An object which supports GPath expressions
195       */
196      public GPathResult parseText(final String text) throws IOException, SAXException {
197        return parse(new StringReader(text));
198      }
199      
200      // Delegated XMLReader methods
201      //------------------------------------------------------------------------
202    
203      /* (non-Javadoc)
204       * @see org.xml.sax.XMLReader#getDTDHandler()
205       */
206      public DTDHandler getDTDHandler() {
207          return this.reader.getDTDHandler();
208      }
209    
210      /* (non-Javadoc)
211       * @see org.xml.sax.XMLReader#getEntityResolver()
212       */
213      public EntityResolver getEntityResolver() {
214          return this.reader.getEntityResolver();
215      }
216    
217      /* (non-Javadoc)
218       * @see org.xml.sax.XMLReader#getErrorHandler()
219       */
220      public ErrorHandler getErrorHandler() {
221          return this.reader.getErrorHandler();
222      }
223    
224      /* (non-Javadoc)
225       * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
226       */
227      public boolean getFeature(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
228          return this.reader.getFeature(uri);
229      }
230    
231      /* (non-Javadoc)
232       * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
233       */
234      public Object getProperty(final String uri) throws SAXNotRecognizedException, SAXNotSupportedException {
235          return this.reader.getProperty(uri);
236      }
237    
238      /* (non-Javadoc)
239       * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
240       */
241      public void setDTDHandler(final DTDHandler dtdHandler) {
242          this.reader.setDTDHandler(dtdHandler);
243      }
244    
245      /* (non-Javadoc)
246       * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
247       */
248      public void setEntityResolver(final EntityResolver entityResolver) {
249          this.reader.setEntityResolver(entityResolver);
250      }
251    
252      /**
253       * Resolves entities against using the suppied URL as the base for relative URLs
254       * 
255       * @param base
256       * The URL used to resolve relative URLs
257       */
258      public void setEntityBaseUrl(final URL base) {
259          this.reader.setEntityResolver(new EntityResolver() {
260              public InputSource resolveEntity(final String publicId, final String systemId) throws IOException {
261                  return new InputSource(new URL(base, systemId).openStream());
262              }
263          });
264      }
265    
266      /* (non-Javadoc)
267       * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
268       */
269      public void setErrorHandler(final ErrorHandler errorHandler) {
270          this.reader.setErrorHandler(errorHandler);
271      }
272    
273      /* (non-Javadoc)
274       * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
275       */
276      public void setFeature(final String uri, final boolean value) throws SAXNotRecognizedException, SAXNotSupportedException {
277          this.reader.setFeature(uri, value);
278      }
279    
280      /* (non-Javadoc)
281       * @see org.xml.sax.XMLReader#setProperty(java.lang.String, java.lang.Object)
282       */
283      public void setProperty(final String uri, final Object value) throws SAXNotRecognizedException, SAXNotSupportedException {
284           this.reader.setProperty(uri, value);
285      }
286      
287      
288      // ContentHandler interface
289      //-------------------------------------------------------------------------                    
290      
291      /* (non-Javadoc)
292       * @see org.xml.sax.ContentHandler#startDocument()
293       */
294      public void startDocument() throws SAXException {
295        this.currentNode = null;
296        this.charBuffer.setLength(0);
297      }
298      
299      /* (non-Javadoc)
300       * @see org.xml.sax.helpers.DefaultHandler#startPrefixMapping(java.lang.String, java.lang.String)
301       */
302      public void startPrefixMapping(final String tag, final String uri) throws SAXException {
303        this.namespaceTagHints.put(tag, uri);
304      }
305    
306      /* (non-Javadoc)
307       * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
308       */
309      public void startElement(final String namespaceURI, final String localName, final String qName, final Attributes atts) throws SAXException {
310        addNonWhitespaceCdata();
311        
312        final Map attributes = new HashMap();
313        final Map attributeNamespaces = new HashMap();
314        
315        for (int i = atts.getLength() - 1; i != -1; i--) {
316          if (atts.getURI(i).length() == 0) {
317            attributes.put(atts.getQName(i), atts.getValue(i));
318          } else {
319            attributes.put(atts.getLocalName(i), atts.getValue(i));
320            attributeNamespaces.put(atts.getLocalName(i), atts.getURI(i));
321          }
322          
323        }
324        
325        final Node newElement;
326        
327        if (namespaceURI.length() == 0){
328          newElement = new Node(this.currentNode, qName, attributes, attributeNamespaces, namespaceURI);
329        } else {
330          newElement = new Node(this.currentNode, localName, attributes, attributeNamespaces, namespaceURI);
331        }
332        
333        if (this.currentNode != null) {
334          this.currentNode.addChild(newElement);
335        }
336        
337        this.stack.push(this.currentNode);
338        this.currentNode = newElement;
339      }
340      
341      /* (non-Javadoc)
342       * @see org.xml.sax.ContentHandler#characters(char[], int, int)
343       */
344      public void characters(final char[] ch, final int start, final int length) throws SAXException {
345        this.charBuffer.append(ch, start, length);
346      }
347      
348      /* (non-Javadoc)
349       * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
350       */
351      public void endElement(final String namespaceURI, final String localName, final String qName) throws SAXException {
352        addNonWhitespaceCdata();
353        
354        final Object oldCurrentNode = this.stack.pop();
355        
356        if (oldCurrentNode != null) {
357          this.currentNode = (Node)oldCurrentNode;
358        }
359      }
360      
361      /* (non-Javadoc)
362       * @see org.xml.sax.ContentHandler#endDocument()
363       */
364      public void endDocument() throws SAXException {
365      }
366      
367      // Implementation methods
368      //-------------------------------------------------------------------------           
369      
370      /**
371       * 
372       */
373      private void addNonWhitespaceCdata() {
374        if (this.charBuffer.length() != 0) {
375          //
376          // This element is preceeded by CDATA if it's not whitespace add it to the body
377          // Note that, according to the XML spec, we should preserve the CDATA if it's all whitespace
378          // but for the sort of work I'm doing ignoring the whitespace is preferable
379          //
380          final String cdata = this.charBuffer.toString();
381          
382          this.charBuffer.setLength(0);
383          if (cdata.trim().length() != 0) {
384            this.currentNode.addChild(cdata);
385          }
386        }   
387      }
388    }