#! /usr/bin/perl use strict; use XML::DOM; use XML::Writer; use Getopt::Long; if($#ARGV < 0) { useage(); exit(); } my $DOING_SUBS = 0; my $level = 0; my $DEBUG = 0; my $GUESS_DATA_TYPE = 0; my $SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS = 0; my $DIRECTORY_PARSE= 0; my $commonSchema = "commonTypes.xsd"; my $inputFile; my $outputFile; my $generationDir; my $help; my $version; my $folder; GetOptions ('i=s' => \$inputFile, 'input=s' => \$inputFile, 'o=s' => \$outputFile, 'output=s' => \$outputFile, 'f=s' => \$folder, 'folder=s' => \$folder, 'h' => \$help, 'help' => \$help, 'v' => \$version, 'version' => \$version, 'r' => \$SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS, 'refs' => \$SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS, 'g' => \$GUESS_DATA_TYPE, 'guess' => \$GUESS_DATA_TYPE, 'l=s' => \$generationDir, 'location=s' => \$generationDir, 'c=s'=> \$commonSchema, 'common=s' => \$commonSchema, 'd' => \$DEBUG, 'debug' => \$DEBUG); if($help) { useage(); exit(); } if($version) { print "generateXSD version 1.2\n"; exit(); } # Constant XML variables. my $DUMMY_ROOT = "dummyRoot"; my $TEMP_FILE = "temp_GenerateXSD.pl.xml"; my $SCHEMA = "schema"; my $ANNOTATION = "annotation"; my $DOCUMENTATION = "documentation"; my $NS = "http://www.w3.org/2001/XMLSchema"; my $ELEMENT = "element"; my $COMPLEX_TYPE = "complexType"; my $SEQUENCE = "sequence"; my $SIMPLE_CONTENT = "simpleContent"; my $EXTENSION = "extension"; my $BASE = "base"; my $TYPE = "type"; my $NAME = "name"; my $REF = "ref"; my $INCLUDE = "include"; my $SCHEMA_LOCATION = "schemaLocation"; my $ELEMENT_FORM_DEFAULT = "elementFormDefault"; my $QUALIFIED = "qualified"; my $UTF8 = "UTF-8"; my $MIN_OCCURS = "minOccurs"; my $UNBOUNDED = "unbounded"; my $MAX_OCCURS = "maxOccurs"; my $ATTRIBUTE = "attribute"; my $DEFAULT_DOCUMENTATION = "DOCUMENT ME!"; my $USE = "use"; my $OPTIONAL = "optional"; my $REQUIRED = "required"; my $STRING = "xs:string"; my $FLOAT = "xs:float"; my $INT = "xs:int"; my $LONG = "xs:long"; my $DATE = "xs:date"; my %markedAttributes; my %markedNodes; my %usedTags; my %rootTags; my %guessedDataTypes; my %duplicatedNodes; my $haveSetWriterToNewFile = 0; my $previousWriter; my $previousOutput; my $withinSubTag = 0; my $previousElement; my $parser = new XML::DOM::Parser; # If the user has specified a directory of XML models to parse. if($generationDir) { $DIRECTORY_PARSE = 1; $SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS = 1; $inputFile = $TEMP_FILE; $outputFile = $commonSchema; my $filesExist = concatFiles($generationDir); if(!$filesExist) { useage(); print "Error: There appear to be no valid XML model files in directory \"$generationDir\".\n\n"; exit(); } } # Exit if there's no input file specified. if(!$inputFile) { useage(); exit(); } # No output file is not a fatal error. if(!$outputFile) { $outputFile = $inputFile; $outputFile .= ".xsd"; } # Determine wether the input file is empty or non-existant. my $inputContents; open (INP, "<$inputFile"); undef $/; my $inputContents = ; if(!$inputContents) { useage(); print "Error: Input file \"$inputFile\" does not exist or is empty.\n\n"; exit(0); } # Parse the input file into our XML document. my $doc = $parser->parsestring ($inputContents); if($folder) { my $folderExists = stat($folder); if(!$folderExists) { system('mkdir '. $folder); } $outputFile = "$folder/$outputFile"; } my $output = new IO::File(">$outputFile"); my $writer = new XML::Writer(NAMESPACES => 1, PREFIX_MAP => {$NS => 'xs'}, OUTPUT => $output); # Exit if there are no nodes in the document. my $root = $doc->getDocumentElement(); if(!defined($root) || $root->getChildNodes()->getLength() == 0) { useage(); print "Error: Input file does not appear to be a valid XML document.\n\n"; exit(0); } if($DEBUG) { print "Using Option: \"debug\".\n"; } # Process a tree of all possible elements and attributes populateDuplicateElementTree($root); if($GUESS_DATA_TYPE && $DEBUG) { print "Using Option: \"Guess data types\".\n" if $DEBUG; } # Create the document header. addHeader($root); # Recurse through all elements starting at the root. addElement($root); # Add document footer and close our writers. addFooter($root); print "Creating file: \"$outputFile\"\n" if $DEBUG; sub useage { print < node. -g --guess Attempt to guess the data type of elements and attributes. Multiple file generation: -l=DIR --location=DIR Generate all files in DIR. (Assumes -r) any commonly used elements will be generated to a file called "commonTypes.xsd" by default. -f --folder=DIR Generate files from specified location to DIR. -c=FILE --common=FILE Specify the filename for the commonly used elements. Logging: -d --debug Print debugging output. END } # Recursively add an element and any sub elements. # If there are sub elements, add them as a sequence of # elements under a complexType. sub addElement { my ($rootElem) = (@_); my $ray = $markedNodes{$rootElem->getTagName()}; # If we have a reference to the node in our cache, # then it will contain the most information present about a node # as opposed to partial information which a node with optional # sub-elements may provide. # Wherever possible use the cache. if($ray) { foreach my $elem (@$ray) { if ($elem->getNodeType == ELEMENT_NODE) { if($elem->getChildNodes()->getLength() == 0) { addSimpleElement($elem); } else { # We generally disallow repeating elements within the unique path. # This allows the current node as we're passing a reference to # a cached node, not the one from the current element. $duplicatedNodes{$elem->getTagName()} = "1"; addComplexType($elem); } } } } else { foreach my $elem (reverse sort $rootElem->getChildNodes()) { if ($elem->getNodeType == ELEMENT_NODE) { my $subElements = $markedNodes{$elem->getTagName()}; if($subElements && length(@{$subElements}) == 0) { addSimpleElement($elem); } else { addComplexType($elem); } } } } } # If we're parsing through a directory, # we will write several XSD files, and then a common types file. # this switches to a new file when appropriate. sub switchToNewWriter { my ($element) = (@_); if($DIRECTORY_PARSE && $level == 0) { if($rootTags{$element->getTagName()}) { if($withinSubTag) { addFooter($previousElement); } if(!$haveSetWriterToNewFile) { $previousOutput = $output; $previousWriter = $writer; $haveSetWriterToNewFile = 1; } $previousElement = $element; $withinSubTag = 1; # Create a new file from the element. my $newFile = $rootTags{$element->getTagName()}; if($folder) { $newFile = "$folder/$newFile"; } print "Creating file: \"$newFile\"\n" if $DEBUG; $output = new IO::File(">$newFile"); $writer = new XML::Writer(NAMESPACES => 1, PREFIX_MAP => {$NS => 'xs'}, OUTPUT => $output); addHeader($element); } } } # Add a single element underneath a complexType/sequence. # Recurse through the nodes and add any sub elements. # If we're segregating duplicate elements to element refs # then add a reference tag and don't recurse further. sub addComplexType { my ($element) = (@_); # Switch to a new file if we're parsing through directories. switchToNewWriter($element); my $tagName = $element->getTagName(); my $uniqueKey = getUniqueKey($element); if(!$usedTags{$uniqueKey} || $DOING_SUBS) { if($markedNodes{$tagName} && $SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS) { if(!$rootTags{$element->getTagName()}) { $writer->emptyTag([$NS, $ELEMENT], $REF => $tagName, $MIN_OCCURS => '0', $MAX_OCCURS => $UNBOUNDED); } } else { if($rootTags{$element->getTagName()}) { $level++; addElement($element); $level--; } else { my $hasElements = 0; # Make sure it's necessary to add a sequence of elements. # at the lowest level of granularity an element will have # a TEXT_NODE only. foreach my $subl($element->getChildNodes()) { if($subl->getNodeType == ELEMENT_NODE) { $hasElements = 1; last; } } if(!$hasElements && $element->getAttributes()->getLength() == 0) { $writer->emptyTag([$NS, $ELEMENT], $NAME => $tagName, $TYPE => guessDataType($element->getTagName()), $MIN_OCCURS => '0', $MAX_OCCURS => $UNBOUNDED); } else { $writer->startTag([$NS, $ELEMENT], $NAME => $tagName, $MIN_OCCURS => '0', $MAX_OCCURS => $UNBOUNDED); $writer->startTag([$NS,$COMPLEX_TYPE]); $writer->startTag([$NS,$SEQUENCE]); $level++; # At this point we should process the element with the # most amount of information from our model. addElement($element); $level--; $writer->endTag([$NS,$SEQUENCE]); # Add any element nodes. my $attributes = $element->getAttributes(); for(my $i=0;$i<$attributes->getLength();$i++) { addSimpleAttribute($element->getTagName(), $attributes->item($i)->getName()); } $writer->endTag([$NS,$COMPLEX_TYPE]); $writer->endTag([$NS,$ELEMENT]); } } } } if(!$duplicatedNodes{$element->getTagName()}) { $usedTags{getUniqueKey($element)} = "1"; } } # Attempt to guess what type of data the element / attribute contains. sub guessDataType { my ($elementName, $attributeName) = (@_); if(!$GUESS_DATA_TYPE) { return $STRING; } my $cachedTag = $guessedDataTypes{$elementName.$attributeName}; if($cachedTag) { return $cachedTag; } my $element; my $valueToCheck; my $useageType = $REQUIRED; my $stringTypePossible = 1; my $intTypePossible = 1; my $floatTypePossible = 1; my $dateTypePossible = 1; my $longTypePossible = 1; my $type = $STRING; my $hadAValue = 0; my $allElementTags = $root->getElementsByTagName($elementName,1); if($root->getTagName() eq $elementName) { $hadAValue = 1; $valueToCheck = $root->getAttribute($attributeName); if($valueToCheck) { if(dateTypePossible($valueToCheck)) { $type = $DATE; } elsif(floatTypePossible($valueToCheck)) { $type = $FLOAT; } elsif(longTypePossible($valueToCheck)) { $type = $LONG; } elsif(intTypePossible($valueToCheck)) { $type = $INT; } } $guessedDataTypes{$elementName.$attributeName} = $type; return $type; } for(my $i=0;$i<$allElementTags->getLength();$i++) { $element = $allElementTags->item($i); $valueToCheck = $element->getTagName(); if($attributeName) { $valueToCheck = $element->getAttribute($attributeName); if(!$valueToCheck) { $useageType = $OPTIONAL; } } else { $valueToCheck = $element->getNodeValue(); } if($valueToCheck) { $hadAValue = 1; if(!$dateTypePossible || !dateTypePossible($valueToCheck)) { $dateTypePossible = 0; } if (!$floatTypePossible || !floatTypePossible($valueToCheck)) { $floatTypePossible = 0; } if (!$intTypePossible || !intTypePossible($valueToCheck)) { $intTypePossible = 0; } if(!$longTypePossible || longTypePossible($valueToCheck)) { $longTypePossible = 0; } if(!$intTypePossible && !$floatTypePossible) { $floatTypePossible = 0; $intTypePossible = 0; last; } } } #Check. if($hadAValue) { if($floatTypePossible) { $type = $FLOAT; } elsif($intTypePossible) { $type = $INT; } elsif($longTypePossible) { $type = $LONG; } elsif($dateTypePossible) { $type = $DATE; } } $guessedDataTypes{$elementName.$attributeName} = $type; return $type; } sub dateTypePossible { my ($valueToCheck) = (@_); return $valueToCheck =~ m#^\d\d/\d\d/\d\d\s\d\d:\d\d$#; } sub floatTypePossible { my ($valueToCheck) = (@_); return ($valueToCheck =~/^([+-]?)(?=\d|\.\d)\d*(\.\d*)?([Ee]([+-]?\d+))?$/ && $valueToCheck =~ /\./); } sub longTypePossible { my ($valueToCheck) = (@_); return ($valueToCheck =~ /^-?\d+$/ && $valueToCheck > 65535); } sub intTypePossible { my ($valueToCheck) = (@_); return ($valueToCheck =~ /^-?\d+$/ && $valueToCheck <= 65535); } sub addSimpleElement { my ($element) = (@_); my $elementName = $element->getTagName(); if(!$usedTags{$elementName}) { if($markedNodes{$elementName} && $SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS) { $writer->emptyTag([$NS, $ELEMENT], $REF => $elementName, $MIN_OCCURS => '0', $MAX_OCCURS => $UNBOUNDED); } else { my $attribs = $markedAttributes{$element->getTagName()}; if($element->getAttributes()->getLength() > 0) { collateAttributes($element->getTagName()); $attribs = $markedAttributes{$element->getTagName()}; } if($attribs && length(@{$attribs}) > 0) { # End of the line, make it optional, but not unbounded. $writer->startTag([$NS,$ELEMENT], $NAME => $elementName, $MIN_OCCURS => '0', $MAX_OCCURS => $UNBOUNDED); $writer->startTag([$NS,$COMPLEX_TYPE]); $writer->startTag([$NS,$SIMPLE_CONTENT]); addAttributes($element); $writer->endTag([$NS,$SIMPLE_CONTENT]); $writer->endTag([$NS,$COMPLEX_TYPE]); $writer->endTag([$NS,$ELEMENT]); } else { # End of the line, make it optional, but not unbounded. $writer->emptyTag([$NS,$ELEMENT], $NAME => $elementName, $TYPE => guessDataType($elementName), $MIN_OCCURS => '0', $MAX_OCCURS => $UNBOUNDED); } } $usedTags{getUniqueKey($element)} = "1"; } } # We want to only allow one element type within a # sequence. Get a unique key which relates to the # path from element's ancestors to itself. sub getUniqueKey { my ($element) = (@_); my $uniqueKey = $element->getTagName(); my $parent = $element->getParentNode(); while($parent && $parent->getNodeType() == ELEMENT_NODE) { $uniqueKey = $parent->getTagName() . $uniqueKey; $parent = $parent->getParentNode(); } return $uniqueKey; } sub addAttributes { my ($element) = (@_); my $attributes = $markedAttributes{$element->getTagName()}; if($attributes && length @{$attributes} > 0) { $writer->startTag([$NS,$EXTENSION], $BASE => $STRING); foreach my $attribute (@{$attributes}) { addSimpleAttribute($element->getTagName(), $attribute->getName()); } $writer->endTag([$NS,$EXTENSION]); } } sub addSimpleAttribute { my ($elementName, $attributeName) = (@_); my $type = guessDataType($elementName, $attributeName); $writer->emptyTag([$NS,$ATTRIBUTE], $NAME => $attributeName, $TYPE => $type, $USE => $OPTIONAL); } sub addHeader { my ($root) = (@_); $writer->xmlDecl($UTF8); $writer->startTag([$NS,$SCHEMA], $ELEMENT_FORM_DEFAULT => $QUALIFIED); if($DIRECTORY_PARSE && $withinSubTag) { $writer->emptyTag([$NS, $INCLUDE], $SCHEMA_LOCATION=>$commonSchema); } $writer->startTag([$NS,$ANNOTATION]); $writer->startTag([$NS,$DOCUMENTATION]); $writer->characters($DEFAULT_DOCUMENTATION); $writer->endTag([$NS,$DOCUMENTATION]); $writer->endTag([$NS,$ANNOTATION]); if($root->getTagName() ne $DUMMY_ROOT) { $writer->startTag([$NS,$ELEMENT], $NAME => $root->getTagName()); $writer->startTag([$NS,$COMPLEX_TYPE]); $writer->startTag([$NS,$SEQUENCE]); } } sub addFooter { my ($root) = (@_); # Revert back to our common types file if we're back at the # top of the stack and we need to process element references. if($DIRECTORY_PARSE && $haveSetWriterToNewFile && $level == 0 && !$rootTags{$root->getTagName()}) { addFooter($previousElement); $writer = $previousWriter; $output = $previousOutput; $withinSubTag = 0; } if($root->getTagName() ne $DUMMY_ROOT) { $writer->endTag([$NS,$SEQUENCE]); my $attributes = $root->getAttributes(); for(my $i=0;$i<$attributes->getLength();$i++) { addSimpleAttribute($root->getTagName(), $attributes->item($i)->getName()); } $writer->endTag([$NS,$COMPLEX_TYPE]); $writer->endTag([$NS,$ELEMENT]); } if(!$withinSubTag) { addSubElements(); } $writer->endTag([$NS,$SCHEMA]); $writer->end(); $output->close(); } sub addSubElements { $DOING_SUBS = 1; if($SEGREGATE_DUPLICATE_TAGS_TO_ELEMENT_REFS) { foreach my $tag (sort keys %markedNodes) { $writer->startTag([$NS, $ELEMENT], $NAME=>$tag); $writer->startTag([$NS, $COMPLEX_TYPE]); $writer->startTag([$NS, $SEQUENCE]); # Get our reference to array of sub-elements and process accordingly. my $elem = $markedNodes{$tag}; foreach my $element (@{$elem}) { my $refElem = $markedNodes{$element}; #If our sub term refers to another common type then #add a reference, otherwise add an element. if( $element->getChildNodes()->getLength() > 1) { addComplexType($element); } else { addSimpleElement($element); } } $writer->endTag([$NS, $SEQUENCE]); my $ele = $root->getElementsByTagName($tag)->item(0); addAttributes($ele); $writer->endTag([$NS, $COMPLEX_TYPE]); $writer->endTag([$NS, $ELEMENT]); } } $DOING_SUBS = 0; } sub populateDuplicateElementTree { my ($r) = (@_); foreach my $elem ($r->getChildNodes()) { if ($elem->getNodeType == ELEMENT_NODE ) { my $name = $elem->getTagName(); if(!$markedNodes{$name}) { my $common = $root->getElementsByTagName($name,1); # We have something which is repeated more than once in our XML model. # populate a hashtable with the elements which appear underneath it. if($common->getLength() > 1 && $common->item(0)->getChildNodes()->getLength() > 0) { populateElementSubElements($common); } } } if($elem->getChildNodes() > 1) { populateDuplicateElementTree($elem); } } } sub collateAttributes { my ($elementName) = (@_); my $attribsArray; my $commonElements = $root->getElementsByTagName($elementName); my $elem; my $attrib; my $attribs; for(my $i=0;$i<$commonElements->getLength;$i++) { $elem = $commonElements->item($i); $attribs = $elem->getAttributes(); for(my $a=0;$a<$attribs->getLength();$a++) { $attrib = $attribs->item($a); if(!containsAttrib($attrib, $attribsArray)) { push(@{$attribsArray}, $attrib); } } } if($attribsArray) { $markedAttributes{$elementName} = $attribsArray; } } sub populateElementSubElements { my ($common) = (@_); my $entry = $markedNodes{$common->item(0)->getTagName()}; my $currentSub; for(my $i=0;$i<$common->getLength();$i++) { my $element = $common->item($i); next unless ($element->getNodeType == ELEMENT_NODE); collateAttributes($element->getTagName); # Get any nodes which appear underneath. my $childNodes = $element->getChildNodes(); for(my $a=0;$a<$childNodes->getLength();$a++) { $currentSub = $childNodes->item($a); next unless ($currentSub->getNodeType() == ELEMENT_NODE); # Only add if the element is not already in the current list. if($currentSub->getTagName() ne "" && !contains($currentSub, $entry)) { # Use an array instead of a hash so that elements # appear in the XSD in the correct sequence - (As per the XML model) push(@{$entry}, $currentSub); } } } if($entry) { $markedNodes{$common->item(0)->getTagName()} = $entry; } } # Return true if an element is present in a list of Attributes. # Matches by getTagName(). sub contains { my ($tag,$array) = (@_); foreach my $t (@{$array}) { if($tag->getTagName() eq $t->getTagName()) { return 1; } } return 0; } sub containsAttrib { my ($tag,$array) = (@_); foreach my $t (@{$array}) { if($tag->getName() eq $t->getName()) { return 1; } } return 0; } sub concatFiles { my ($dir) = (@_); my $filesExist = 0; print "Processing Directory \"$dir\"\n" if $DEBUG; my $concatOutput = $TEMP_FILE; open (FH, ">$concatOutput"); undef $/; opendir(DIR, "$dir/"); print FH "<$DUMMY_ROOT>\n"; my $currentRootTag; my $file; while (defined($file = readdir(DIR))) { if($file =~ /.xml$/) { $filesExist = 1; print "Processing file: \"" . $file . "\"\n" if $DEBUG; open(RH, "$dir/$file"); my $fileContent = ; my $currentDocument = $parser->parsestring($fileContent); if($currentDocument) { $currentRootTag = $currentDocument->getDocumentElement()->getTagName(); } $file =~ s/\.xml/.xsd/; $rootTags{$currentRootTag} = $file; print FH $fileContent; close(RH); } } print FH "<\/$DUMMY_ROOT>"; closedir(DIR); close(FH); return $filesExist; }