Resolve #85: add PENMAN serialization

This is made available by `to_triples()` and `from_triples()` methods on Xmrs subclasses.
delph-in · Jan 18, 2017 · c11f82f · c11f82f
1 parent bfc876b
commit c11f82f
Show file tree

Hide file tree

Showing 7 changed files with 315 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,15 +4,25 @@
 
 This release replaces the top-level `pyDelphin` and `mrs.py` scripts with
 `delphin.sh` (when installed, a `delphin` command is made available that
-accomplishes the same thing).
+accomplishes the same thing). It also introduces the PENMAN codec for
+DMRS and EDS.
 
 ### Added
 
-* `delphin.sh` top-level script (replaces the former `pyDelphin` and `mrs.py`)
+* `delphin.sh` top-level script (replaces the former `pyDelphin` and
+  `mrs.py`)
 * `delphin.main` module for script usage
-* `mrs-json` and `dmrs-json` codecs for the `convert` script command
-  (currently only for Python 3, however)
+* `mrs-json`, `dmrs-json`, and `eds-json` codecs for the `convert`
+  script command
+* `dmrs-penman` and `eds-penman` codecs for the `convert` script command
 * Skeleton creation from sentence lists in the `mkprof` script command
+* `delphin.mrs.xmrs.Xmrs.from_xmrs()` (meant to be used by subclasses)
+* `delphin.mrs.xmrs.Dmrs.to_triples()`
+* `delphin.mrs.xmrs.Dmrs.from_triples()`
+* `delphin.mrs.eds.Eds.to_triples()`
+* `delphin.mrs.eds.Eds.from_triples()`
+* `delphin.mrs.penman` module for PENMAN serialization of DMRS and EDS
+  (resolves #85)
 
 ### Changed
 
@@ -25,6 +35,8 @@ accomplishes the same thing).
 * Quantifiers are detected more consistently for, e.g., DMRS conversion;
   this mostly resolves #84
 * DMRS `/EQ` links are now `MOD/EQ` and fix a direction (resolves #87)
+* All \*MRS serializers/exporters take `**kwargs` (though many ignore
+  them) so that a common API can be used for, e.g., \*MRS conversion.
 
 ### Fixed
 

diff --git a/README.md b/README.md
@@ -123,6 +123,8 @@ requirements include:
     REST client
   - [Pygments](http://pygments.org/) for TDL and SimpleMRS syntax
     highlighting
+  - [Penman](https://github.com/goodmami/penman) for PENMAN
+    serialization of DMRS and EDS
   - [tikz-dependency](https://www.ctan.org/pkg/tikz-dependency), while
     not a Python requirement, is needed for compiling LaTeX documents
     using exported DMRSs

diff --git a/delphin/main.py b/delphin/main.py
@@ -118,11 +118,12 @@
 import os
 import logging
 import json
+from functools import partial
 
 from docopt import docopt
 
 from delphin.__about__ import __version__
-from delphin.mrs import xmrs
+from delphin.mrs import xmrs, eds
 from delphin import itsdb
 
 
@@ -166,7 +167,8 @@ def convert(args):
         mrx,
         dmrx,
         eds,
-        simpledmrs
+        simpledmrs,
+        penman
     )
     from delphin.extra import latex
     codecs = dict([
@@ -176,18 +178,26 @@ def convert(args):
         ('eds', (eds.loads, eds.dumps)),
         ('mrs-json', (_mrs_json.loads, _mrs_json.dumps)),
         ('dmrs-json', (_dmrs_json.loads, _dmrs_json.dumps)),
+        ('eds-json', (_eds_json.loads, _eds_json.dumps)),
+        ('dmrs-penman', (partial(penman.loads, model=xmrs.Dmrs),
+                         partial(penman.dumps, model=xmrs.Dmrs))),
+        ('eds-penman', (partial(penman.loads, model=eds.Eds),
+                         partial(penman.dumps, model=eds.Eds))),
         ('simpledmrs', (None, simpledmrs.dumps)),
         ('dmrs-tikz', (None, latex.dmrs_tikz_dependency))
     ])
     decoders = set(k for k, cd in codecs.items() if cd[0])
     encoders = set(k for k, cd in codecs.items() if cd[1])
+
     # arg validation
     if args['--from'] not in decoders:
         sys.exit('Source format must be one of: {}'
                  .format(', '.join(sorted(decoders))))
     if args['--to'] not in encoders:
         sys.exit('Source format must be one of: {}'
                  .format(', '.join(sorted(encoders))))
+    if args['--from'].startswith('eds') and not args['--to'].startswith('eds'):
+        sys.exit('Conversion from EDS to non-EDS currently not supported.')
     args['--color'] = (
         args['--color'] == 'always' or
         (args['--color'] == 'auto' and sys.stdout.isatty())
@@ -340,15 +350,23 @@ def loads(self, s):
     def dumps(self, xs, pretty_print=False, indent=None, **kwargs):
         if pretty_print and indent is None:
             indent = 2
-        return json.dumps([self.CLS.to_dict(x) for x in xs], indent=indent)
-
+        return json.dumps(
+            [self.CLS.to_dict(
+                x if isinstance(x, self.CLS) else self.CLS.from_xmrs(x)
+             ) for x in xs],
+            indent=indent
+        )
 
 class _DMRS_JSON(_MRS_JSON):
     CLS = xmrs.Dmrs
 
+class _EDS_JSON(_MRS_JSON):
+    CLS = eds.Eds
+
 
 _mrs_json = _MRS_JSON()
 _dmrs_json = _DMRS_JSON()
+_eds_json = _EDS_JSON()
 
 # working with directories and profiles
 

diff --git a/delphin/mrs/eds.py b/delphin/mrs/eds.py
@@ -88,11 +88,10 @@ def to_dict(self, properties=True):
             if node.lnk is not None:
                 nd['lnk'] = {'from': node.cfrom, 'to': node.cto}
             if properties:
-                props = node.sortinfo
+                if node.cvarsort is not None:
+                    nd['type'] = node.cvarsort
+                props = node.properties
                 if props:
-                    if CVARSORT in props:
-                        nd['type'] = props[CVARSORT]
-                        del props[CVARSORT]
                     nd['properties'] = props
             if node.carg is not None:
                 nd['carg'] = node.carg
@@ -129,6 +128,67 @@ def from_dict(cls, d):
         nodes.sort(key=lambda n: (n.cfrom, -n.cto))
         return cls(top, nodes=nodes, edges=edges)
 
+    def to_triples(self, short_pred=True, properties=True):
+        """
+        Encode the Eds as a list of (source, relation, target) triples.
+
+        Args:
+            short_pred: if True, use each predicate's `short_form()`;
+                otherwise use its full `string`
+            properties: if True, also emit a 'type' triple (from the
+                node's cvarsort) and one triple per node property
+        Returns:
+            a list of triples: all node triples first (starting with
+            the top node), followed by all edge triples
+        """
+        node_triples, edge_triples = [], []
+        # sort nodeids just so top var is first (False sorts before True,
+        # and the sort is stable, so the other nodes keep their order)
+        nodes = sorted(self.nodes(), key=lambda n: n.nodeid != self.top)
+        for node in nodes:
+            nid = node.nodeid
+            pred = node.pred.short_form() if short_pred else node.pred.string
+            node_triples.append((nid, 'predicate', pred))
+            if node.lnk:
+                node_triples.append((nid, 'lnk', '"{}"'.format(str(node.lnk))))
+            if node.carg:
+                node_triples.append((nid, 'carg', '"{}"'.format(node.carg)))
+            if properties:
+                # read the type from node.cvarsort (as to_dict() does);
+                # indexing `props` here would fail because it is not
+                # assigned until the next statement
+                if node.cvarsort is not None:
+                    node_triples.append((nid, 'type', node.cvarsort))
+                props = node.properties
+                node_triples.extend((nid, p, v) for p, v in props.items())
+            edge_triples.extend(
+                (nid, rargname, tgt)
+                for rargname, tgt in sorted(
+                    self.edges(nid).items(),
+                    key=lambda x: rargname_sortkey(x[0])
+                )
+            )
+        return node_triples + edge_triples
+
+    @classmethod
+    def from_triples(cls, triples):
+        """
+        Decode (source, relation, target) triples into an Eds.
+
+        Relations are interpreted as follows:
+          - 'predicate': the node's predicate
+          - 'lnk': a character span such as "<0:5>" (quotes optional)
+          - 'carg': the constant argument (surrounding double quotes
+            are removed)
+          - 'type': stored in the node's sortinfo under CVARSORT
+          - any other all-lowercase relation: a sortinfo property
+          - everything else: an edge from source to target
+
+        Args:
+            triples: an iterable of (source, relation, target) triples
+        Returns:
+            an instance of *cls* whose top is the first source seen
+            (or None if *triples* is empty)
+        """
+        nids, nd, edges = [], {}, []
+        for src, rel, tgt in triples:
+            if src not in nd:
+                # remember first-seen order; the first node becomes top
+                nids.append(src)
+                nd[src] = {'pred': None, 'lnk': None, 'carg': None, 'si': []}
+            if rel == 'predicate':
+                nd[src]['pred'] = Pred.string_or_grammar_pred(tgt)
+            elif rel == 'lnk':
+                cfrom, cto = tgt.strip('"<>').split(':')
+                nd[src]['lnk'] = Lnk.charspan(int(cfrom), int(cto))
+            elif rel == 'carg':
+                # slices instead of indexing so that an empty string
+                # does not raise IndexError
+                if tgt[:1] == '"' and tgt[-1:] == '"':
+                    tgt = tgt[1:-1]
+                nd[src]['carg'] = tgt
+            elif rel == 'type':
+                nd[src]['si'].append((CVARSORT, tgt))
+            elif rel.islower():
+                nd[src]['si'].append((rel, tgt))
+            else:
+                edges.append((src, rel, tgt))
+        nodes = [
+            Node(
+                nodeid=nid,
+                pred=nd[nid]['pred'],
+                sortinfo=nd[nid]['si'],
+                lnk=nd[nid]['lnk'],
+                carg=nd[nid]['carg']
+            ) for nid in nids
+        ]
+        top = nids[0] if nids else None
+        return cls(top=top, nodes=nodes, edges=edges)
+
 
 def _find_dependencies(m, eps):
     deps = {}

diff --git a/delphin/mrs/penman.py b/delphin/mrs/penman.py
@@ -0,0 +1,122 @@
+
+"""
+Serialization functions for the PENMAN graph format.
+
+Unlike other *MRS serializers, this one takes a *model* argument for
+the load(), loads(), dump(), and dumps() functions, which determines what
+the graph will look like. This is because DMRS and EDS (and possibly
+others) yield different graph structures, but both can be encoded as
+PENMAN graphs. In this sense, it's more like JSON formatting of *MRS.
+"""
+
+from __future__ import absolute_import, print_function
+
+import penman
+
+from delphin.mrs.config import LTOP_NODEID
+
+
+class XMRSCodec(penman.PENMANCodec):
+    """
+    PENMAN codec subclass used by this module for *MRS graphs.
+
+    It only overrides three class-level settings of
+    `penman.PENMANCodec`; how each setting is used is defined by the
+    penman library.
+    """
+    # relation name for a node's predicate (instead of penman's default)
+    TYPE_REL = 'predicate'
+    # NOTE(review): presumably the variable and relation penman uses to
+    # represent the graph's top -- confirm against the penman docs
+    TOP_VAR = LTOP_NODEID
+    TOP_REL = 'top'
+
+
+def load(fh, model):
+    """
+    Read PENMAN graphs from a file and build *model* objects from them
+
+    Args:
+        fh: filename or file object
+        model: the Xmrs subclass whose from_triples() is called with
+            the triples of each decoded graph
+    Returns:
+        a list of objects (of class *model*), one per graph
+    """
+    xs = []
+    for graph in penman.load(fh, cls=XMRSCodec):
+        xs.append(model.from_triples(graph.triples()))
+    return xs
+
+
+def loads(s, model):
+    """
+    Read PENMAN graphs from a string and build *model* objects from them
+
+    Args:
+        s: a string containing PENMAN graphs
+        model: the Xmrs subclass whose from_triples() is called with
+            the triples of each decoded graph
+    Returns:
+        a list of objects (of class *model*), one per graph
+    """
+    xs = []
+    for graph in penman.loads(s, cls=XMRSCodec):
+        xs.append(model.from_triples(graph.triples()))
+    return xs
+
+
+def dump(fh, xs, model=None, properties=False, indent=True, **kwargs):
+    """
+    Serialize [Xmrs] (or subclass) objects to PENMAN and write to a file
+
+    Args:
+        fh: filename or file object
+        xs: an iterator of [Xmrs] objects to serialize
+        model: the Xmrs subclass used to get triples
+        properties: if True, encode variable properties
+        indent: if True, adaptively indent; if False or None, don't
+            indent; if a non-negative integer N, indent N spaces per level
+        pretty_print: (deprecated) if set, it overrides indent
+    Returns:
+        None
+    """
+    text = dumps(
+        xs, model=model, properties=properties, indent=indent, **kwargs
+    )
+    # *fh* may be an open file-like object or a path; the original code
+    # tested an undefined name `file` here instead of the parameter
+    if hasattr(fh, 'write'):
+        print(text, file=fh)
+    else:
+        with open(fh, 'w') as outfile:
+            print(text, file=outfile)
+
+
+def dumps(xs, model=None, properties=False, indent=True, **kwargs):
+    """
+    Serialize [Xmrs] (or subclass) objects to PENMAN notation
+
+    Args:
+        xs: an iterator of [Xmrs] objects to serialize
+        model: the Xmrs subclass used to get triples; if None, the
+            class of the first item in *xs* is used
+        properties: if True, encode variable properties
+        indent: if True, adaptively indent; if False or None, don't
+            indent; if a non-negative integer N, indent N spaces per level
+        pretty_print: (deprecated) if set, it overrides indent
+    Returns:
+        the PENMAN serialization of *xs* (the empty string if *xs* is
+        empty)
+    Raises:
+        TypeError: if *model* does not implement to_triples()
+    """
+    xs = list(xs)
+
+    if not xs:
+        return ''
+
+    if model is None:
+        model = xs[0].__class__
+
+    if not hasattr(model, 'to_triples'):
+        raise TypeError(
+            '{} class does not implement to_triples()'.format(model.__name__)
+        )
+
+    codec = XMRSCodec()
+    graphs = [
+        codec.triples_to_graph(
+            model.to_triples(
+                # objects that are already *model* instances are used
+                # as-is; only other objects are converted via from_xmrs()
+                x if isinstance(x, model) else model.from_xmrs(x),
+                properties=properties
+            )
+        )
+        for x in xs
+    ]
+
+    # deprecated alias: when given, pretty_print replaces indent
+    if 'pretty_print' in kwargs:
+        indent = kwargs['pretty_print']
+
+    return penman.dumps(graphs, cls=XMRSCodec, indent=indent)
+
+
+def _canonical_ids(ts):
+    """
+    Return the triples *ts* unchanged.
+
+    NOTE(review): the trailing comment keeps a disabled variant that
+    would coerce each triple's source and target to str; confirm
+    whether it should be restored or removed.
+    """
+    return ts # [(str(s), r, str(t)) for s, r, t in ts]