Skip to content

Invalid attributes producing extra null characters during DTD validation  #127

@ChrisTrenkamp

Description

@ChrisTrenkamp

Use the catalog here to reproduce this issue: #104 (comment)

test.xml

<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE concept PUBLIC "-//OASIS//DTD DITA Concept//EN" "concept.dtd">
<concept id="foo">
   <title>foo</title>
   <conbody>
      <note type="note" verbose="yes">
         <p>The verbose attribute is invalid, and it causes wstx to produce a
         null character when writing the XML.</p>
      </note>
   </conbody>
</concept>

WstxValidatorTest.java

import com.ctc.wstx.stax.WstxInputFactory;
import com.ctc.wstx.stax.WstxOutputFactory;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.nio.file.Paths;
import javax.xml.catalog.CatalogFeatures;
import javax.xml.catalog.CatalogManager;
import javax.xml.stream.XMLInputFactory;

/**
 * Reproduction driver for the issue described above.
 *
 * <p>Reads {@code path/to/test.xml} with validation enabled, prints every attribute of every
 * start element (its value plus a hex dump of the value's UTF-8 bytes), copies each event to an
 * in-memory XML event writer, and finally prints the XML that was written.
 */
public class WstxValidatorTest {
  /**
   * Runs the reproduction.
   *
   * @param args unused
   * @throws Exception if reading, validating, or writing the XML fails
   */
  public static void main(String[] args) throws Exception {
    // Resolve the DOCTYPE's public identifier through the local catalog.
    var catalog =
        CatalogManager.catalogResolver(
            CatalogFeatures.defaults(), Paths.get("path/to/catalog.xml").toUri());
    var factory = new WstxInputFactory();
    factory.setProperty(XMLInputFactory.IS_VALIDATING, true);
    factory.setProperty(XMLInputFactory.RESOLVER, catalog);

    var streamReader = factory.createXMLStreamReader(new File("path/to/test.xml"));
    // Report validation problems on stderr instead of aborting the parse.
    streamReader.setValidationProblemHandler(p -> System.err.println(p.getMessage()));
    var reader = factory.createXMLEventReader(streamReader);
    var buffer = new ByteArrayOutputStream();
    var writer = new WstxOutputFactory().createXMLEventWriter(buffer);

    while (reader.hasNext()) {
      var event = reader.nextEvent();

      if (event.isStartElement()) {
        var attrs = event.asStartElement().getAttributes();

        while (attrs.hasNext()) {
          var attr = attrs.next();
          var value = attr.getValue();

          // Encode explicitly as UTF-8 so the hex dump does not depend on the platform's
          // default charset and matches the charset used to decode the output below.
          System.out.println(
              attr.getName() + " = '" + value + "' ( " + hex(value.getBytes("UTF-8")) + ")");
        }
      }
      writer.add(event);
    }

    // The writer may buffer output internally; flush and close it so the byte buffer holds
    // everything that was written before it is decoded.
    writer.flush();
    writer.close();
    reader.close();

    var xmlResult = new String(buffer.toByteArray(), "UTF-8");
    System.out.println(xmlResult);
  }

  /**
   * Formats each byte as {@code 0xNN} followed by a single space.
   *
   * @param bytes the bytes to format
   * @return the concatenated hex tokens; empty when {@code bytes} is empty
   */
  static String hex(byte[] bytes) {
    var buffer = new StringBuilder();

    for (var i : bytes) {
      buffer.append(String.format("0x%02x ", i));
    }

    return buffer.toString();
  }
}

Output:

...
Element <note> has no attribute "verbose"
type = 'note' ( 0x6e 0x6f 0x74 0x65 )
verbose = 'yes    ' ( 0x79 0x65 0x73 0x00 0x00 0x00 0x00 )
class = '- topic/note ' ( 0x2d 0x20 0x74 0x6f 0x70 0x69 0x63 0x2f 0x6e 0x6f 0x74 0x65 0x20 )
Exception in thread "main" com.ctc.wstx.exc.WstxIOException: Invalid null character in text to output
	at com.ctc.wstx.sw.BaseNsStreamWriter.doWriteAttr(BaseNsStreamWriter.java:531)
	at com.ctc.wstx.sw.SimpleNsStreamWriter.writeAttribute(SimpleNsStreamWriter.java:90)
	at org.codehaus.stax2.ri.Stax2EventWriterImpl.add(Stax2EventWriterImpl.java:61)
	at org.codehaus.stax2.ri.Stax2EventWriterImpl.add(Stax2EventWriterImpl.java:108)
	at WstxValidatorTest.main(WstxValidatorTest.java:46)
Caused by: java.io.IOException: Invalid null character in text to output
	at com.ctc.wstx.api.InvalidCharHandler$FailingHandler.convertInvalidChar(InvalidCharHandler.java:52)
	at com.ctc.wstx.sw.XmlWriter.handleInvalidChar(XmlWriter.java:628)
	at com.ctc.wstx.sw.BufferingXmlWriter.writeAttrValue(BufferingXmlWriter.java:1082)
	at com.ctc.wstx.sw.BufferingXmlWriter.writeAttribute(BufferingXmlWriter.java:925)
	at com.ctc.wstx.sw.BaseNsStreamWriter.doWriteAttr(BaseNsStreamWriter.java:528)
	... 4 more

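Notice the output for the verbose attribute: its value "yes" is padded with four extra null characters (0x00). These null characters are what cause the writer to fail with "Invalid null character in text to output".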

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions