Source code for sdv.utils

# Copyright (c) 2015, The MITRE Corporation. All rights reserved.
# See LICENSE.txt for complete terms.

# builtin
import os
import contextlib
import datetime
from distutils.version import StrictVersion

# external
import dateutil.parser
from lxml import etree

# relative
from . import errors, xmlconst


# Module-wide XML parser instance. It starts out unset (``None``); it is
# created lazily by ``get_xml_parser()`` or replaced via ``set_xml_parser()``.
_XML_PARSER = None


@contextlib.contextmanager
def ignored(*exceptions):
    """Context manager that silently discards the given exception types.

    Any exception raised inside the ``with`` body that is an instance of one
    of `exceptions` is swallowed; every other exception propagates normally.
    Python 3.4+ ships the same facility as ``contextlib.suppress()``.

    Args:
        *exceptions: The exception classes to suppress.

    """
    try:
        yield
    except exceptions:
        # Swallow the exception and end the generator normally.
        return
def get_xml_parser(encoding=None):
    """Return the module-wide XML parser, creating it on first use.

    The parser is built once and stored in the module-level ``_XML_PARSER``;
    later calls hand back that same object.

    Note:
        `encoding` is only consulted when the parser is first created. Once
        a parser exists (including one installed via ``set_xml_parser()``),
        the argument has no effect.

    Args:
        encoding: The expected encoding of input documents. When ``None``,
            the parser attempts to detect the document encoding itself.

    Returns:
        The global XML parser object.

    """
    global _XML_PARSER

    if _XML_PARSER:
        return _XML_PARSER

    # Conservative settings: no DTD loading, no network access and no entity
    # resolution while parsing input documents.
    options = dict(
        attribute_defaults=False,
        load_dtd=False,
        huge_tree=False,
        no_network=True,
        ns_clean=True,
        recover=False,
        remove_pis=False,
        remove_blank_text=False,
        remove_comments=False,
        resolve_entities=False,
        strip_cdata=True,
        encoding=encoding,
    )
    _XML_PARSER = etree.ETCompatXMLParser(**options)
    return _XML_PARSER
def set_xml_parser(parser):
    """Install `parser` as the module-wide XML parser.

    Note:
        `parser` is expected to be an instance of ``lxml.etree.XMLParser``.
        No type check is performed here, so passing anything else may lead
        to undesired behavior later on.

    Args:
        parser: An etree parser.

    """
    global _XML_PARSER

    _XML_PARSER = parser
def get_etree_root(doc):
    """Resolve `doc` to its root ``lxml.etree._Element``.

    Args:
        doc: The input XML document. May be an ``lxml.etree._Element`` (which
            is returned unchanged), an ``lxml.etree._ElementTree``, a
            file-like object, or a string filename.

    Returns:
        An ``lxml.etree._Element`` instance for `doc`.

    Raises:
        .ValidationError: If `doc` cannot be found or is not a well-formed
            XML document. Any exception raised while resolving the root is
            re-raised as this type, carrying the original message.

    """
    try:
        if isinstance(doc, etree._Element):  # noqa
            return doc

        if isinstance(doc, etree._ElementTree):  # noqa
            return doc.getroot()

        # Anything else is handed to lxml to parse (filename or file-like).
        return etree.parse(doc, parser=get_xml_parser()).getroot()
    except Exception as ex:
        raise errors.ValidationError(str(ex))
def get_target_ns(doc):
    """Look up the ``targetNamespace`` attribute on the root of `doc`.

    Args:
        doc: An XML document in any form accepted by ``get_etree_root()``.

    Returns:
        The value of the root element's ``targetNamespace`` attribute.

    Raises:
        KeyError: If the root of `doc` has no ``targetNamespace`` attribute.
        .ValidationError: If `doc` cannot be found or is not a well-formed
            XML document.

    """
    attributes = get_etree_root(doc).attrib
    return attributes['targetNamespace']
def get_schemaloc_pairs(node):
    """Parses the xsi:schemaLocation attribute on `node`.

    The attribute value is a whitespace-separated sequence of alternating
    namespace and schema location tokens. A trailing namespace without a
    matching location is dropped.

    Args:
        node: An etree node carrying an xsi:schemaLocation attribute.

    Returns:
        A list of (ns, schemaLocation) tuples for the node.

    Raises:
        KeyError: If `node` does not have an xsi:schemaLocation attribute.

    """
    schemalocs = node.attrib[xmlconst.TAG_SCHEMALOCATION]
    tokens = schemalocs.split()

    # Materialize the pairs: on Python 3 ``zip`` is a one-shot iterator, which
    # would break callers that index, measure or re-iterate the result.
    return list(zip(tokens[::2], tokens[1::2]))
def list_xml_files(directory, recursive=False):
    """Returns a list of file paths for XML files contained within
    `directory`.

    Args:
        directory: A path to a directory.
        recursive: If ``True``, this function will descend into all
            subdirectories.

    Returns:
        A list of paths for the ``.xml`` files directly under `directory`,
        followed by those found in its subdirectories when `recursive` is
        ``True``.

    """
    files, dirs = [], []

    for fn in os.listdir(directory):
        fp = os.path.join(directory, fn)

        # Check for directories first so that a directory whose name happens
        # to end in ".xml" is descended into rather than reported as a file.
        if os.path.isdir(fp):
            dirs.append(fp)
        elif fn.endswith('.xml'):
            files.append(fp)

    if recursive:
        for subdir in dirs:
            files.extend(list_xml_files(subdir, recursive))

    return files
def get_xml_files(files, recursive=False):
    """Expand `files` into a flat list of file paths to validate.

    Entries that are directories are replaced by the ``.xml`` files they
    contain; every other entry is passed through untouched.

    Args:
        files: A list of file paths and/or directory paths. An empty or
            ``None`` value yields an empty list.
        recursive: If ``True``, subdirectories of any input directory are
            searched as well.

    Returns:
        A list of file paths to validate.

    """
    collected = []

    for path in files or ():
        if not os.path.isdir(path):
            collected.append(path)
            continue

        collected.extend(list_xml_files(path, recursive))

    return collected
def get_type_ns(doc, typename):
    """Resolve the namespace bound to the prefix of an ``xsi:type`` value.

    The prefix is the portion of `typename` before the first ``:`` and is
    looked up in the namespace map of the root element of `doc`.

    Args:
        doc: An XML document. This can be a filename, file-like object,
            ``etree._Element``, or ``etree._ElementTree`` instance.
        typename: The ``xsi:type`` value for a given vocabulary instance.

    Returns:
        The namespace mapped to the prefix of `typename`.

    Raises:
        .ValidationError: If the prefix is not declared on the root of
            `doc`.

    """
    nsmap = get_etree_root(doc).nsmap
    prefix, _, _ = typename.partition(':')

    try:
        return nsmap[prefix]
    except KeyError:
        raise errors.ValidationError(
            "xsi:type '%s' contains unresolvable namespace prefix." % typename
        )
def get_namespace(node):
    """Return the namespace portion of the qualified name of `node`.

    Args:
        node: An etree node.

    Returns:
        The ``namespace`` attribute of the ``etree.QName`` built from `node`.

    """
    return etree.QName(node).namespace
def is_stix(doc):
    """Attempts to determine if the input `doc` is a STIX XML instance
    document. If the root-level element falls under a namespace which starts
    with ``http://stix.mitre.org``, this will return True.

    Args:
        doc: An XML document. This can be a filename, file-like object,
            ``etree._Element``, or ``etree._ElementTree`` instance.

    Returns:
        ``True`` if the root element's namespace starts with
        ``http://stix.mitre.org``; ``False`` otherwise, including when the
        root element has no namespace at all.

    Raises:
        .ValidationError: If `doc` cannot be found or is not a well-formed
            XML document.

    """
    root = get_etree_root(doc)
    namespace = get_namespace(root)

    # An un-namespaced root yields no namespace; treat that as "not STIX"
    # instead of failing on ``None.startswith``.
    if not namespace:
        return False

    return namespace.startswith("http://stix.mitre.org")
def is_cybox(doc):
    """Attempts to determine if the input `doc` is a CybOX XML instance
    document. If the root-level element falls under a namespace which starts
    with ``http://cybox.mitre.org``, this will return True.

    Args:
        doc: An XML document. This can be a filename, file-like object,
            ``etree._Element``, or ``etree._ElementTree`` instance.

    Returns:
        ``True`` if the root element's namespace starts with
        ``http://cybox.mitre.org``; ``False`` otherwise, including when the
        root element has no namespace at all.

    Raises:
        .ValidationError: If `doc` cannot be found or is not a well-formed
            XML document.

    """
    root = get_etree_root(doc)
    namespace = get_namespace(root)

    # An un-namespaced root yields no namespace; treat that as "not CybOX"
    # instead of failing on ``None.startswith``.
    if not namespace:
        return False

    return namespace.startswith("http://cybox.mitre.org")
def is_version_equal(x, y):
    """Report whether the `x` and `y` version numbers are semantically
    equivalent.

    Both inputs are parsed with ``StrictVersion`` and the parsed versions are
    compared, so differently written strings can still be equal.

    Examples:
        The version strings "2.1.0" and "2.1" represent semantically
        equivalent versions, despite not being equal strings.

    Args:
        x: A string version number. Ex: '2.1.0'
        y: A string version number. Ex: '2.1'

    Returns:
        ``True`` if the two parsed versions compare equal.

    """
    left = StrictVersion(x)
    right = StrictVersion(y)
    return left == right
def parse_timestamp(value):
    """Convert `value` into a ``datetime.datetime`` instance.

    Args:
        value: A timestamp. This can be a string or ``datetime.datetime``
            value.

    Returns:
        ``None`` if `value` is ``None`` (or otherwise falsy, such as an empty
        string); `value` itself if it is already a ``datetime.datetime``;
        otherwise the result of parsing `value` with ``dateutil``.

    """
    if not value:
        return None

    if isinstance(value, datetime.datetime):
        return value

    return dateutil.parser.parse(value)
def has_tzinfo(timestamp):
    """Returns ``True`` if the `timestamp` includes timezone or UTC offset
    information.

    Args:
        timestamp: A timestamp. This can be a string or datetime.datetime
            value, or ``None``.

    Returns:
        ``True`` if the parsed timestamp carries ``tzinfo``; ``False``
        otherwise, including when `timestamp` is empty or ``None``.

    """
    ts = parse_timestamp(timestamp)

    # Wrap the whole expression in bool() so that an empty timestamp yields
    # False rather than leaking None to the caller.
    return bool(ts and ts.tzinfo)
def strip_whitespace(string):
    """Return a copy of `string` with every whitespace character removed.

    Args:
        string: The text to process, or ``None``.

    Returns:
        The whitespace-free copy of `string`, or ``None`` if `string` is
        ``None``.

    """
    # Splitting on whitespace and re-joining drops every whitespace run.
    return None if string is None else ''.join(string.split())
def has_content(node):
    """Report whether `node` has child elements or non-whitespace text.

    Note:
        Only ``node.text`` and the children matched by ``findall('*')`` are
        inspected. Whitespace-only text does not count as content.

    Args:
        node: An etree node, or ``None``.

    Returns:
        ``True`` if `node` has at least one child element or its text holds
        something other than whitespace; ``False`` otherwise, including when
        `node` is ``None``.

    """
    if node is None:
        return False

    if node.findall('*'):
        return True

    return bool(strip_whitespace(node.text))