Skip to content

Commit

Permalink
Resolve #85: add PENMAN serialization
Browse files Browse the repository at this point in the history
This is made available by `to_triples()` and `from_triples()` methods on Xmrs subclasses.
  • Loading branch information
goodmami committed Jan 18, 2017
1 parent bfc876b commit c11f82f
Show file tree
Hide file tree
Showing 7 changed files with 315 additions and 16 deletions.
20 changes: 16 additions & 4 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,25 @@

This release replaces the top-level `pyDelphin` and `mrs.py` scripts with
`delphin.sh` (when installed, a `delphin` command is made available that
accomplishes the same thing).
accomplishes the same thing). It also introduces the PENMAN codec for
DMRS and EDS.

### Added

* `delphin.sh` top-level script (replaces the former `pyDelphin` and `mrs.py`)
* `delphin.sh` top-level script (replaces the former `pyDelphin` and
`mrs.py`)
* `delphin.main` module for script usage
* `mrs-json` and `dmrs-json` codecs for the `convert` script command
(currently only for Python 3, however)
* `mrs-json`, `dmrs-json`, and `eds-json` codecs for the `convert`
script command
* `dmrs-penman` and `eds-penman` codecs for the `convert` script command
* Skeleton creation from sentence lists in the `mkprof` script command
* `delphin.mrs.xmrs.Xmrs.from_xmrs()` (meant to be used by subclasses)
* `delphin.mrs.xmrs.Dmrs.to_triples()`
* `delphin.mrs.xmrs.Dmrs.from_triples()`
* `delphin.mrs.eds.Eds.to_triples()`
* `delphin.mrs.eds.Eds.from_triples()`
* `delphin.mrs.penman` module for PENMAN serialization of DMRS and EDS
(resolves #85)

### Changed

Expand All @@ -25,6 +35,8 @@ accomplishes the same thing).
* Quantifiers are detected more consistently for, e.g., DMRS conversion;
this mostly resolves #84
* DMRS `/EQ` links are now `MOD/EQ` and fix a direction (resolves #87)
* All \*MRS serializers/exporters take `**kwargs` (though many ignore them)
so that a common API can be used for, e.g., \*MRS conversion.

### Fixed

Expand Down
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,8 @@ requirements include:
REST client
- [Pygments](http://pygments.org/) for TDL and SimpleMRS syntax
highlighting
- [Penman](https://github.com/goodmami/penman) for PENMAN
serialization of DMRS and EDS
- [tikz-dependency](https://www.ctan.org/pkg/tikz-dependency), while
not a Python requirement, is needed for compiling LaTeX documents
using exported DMRSs
Expand Down
26 changes: 22 additions & 4 deletions delphin/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,12 @@
import os
import logging
import json
from functools import partial

from docopt import docopt

from delphin.__about__ import __version__
from delphin.mrs import xmrs
from delphin.mrs import xmrs, eds
from delphin import itsdb


Expand Down Expand Up @@ -166,7 +167,8 @@ def convert(args):
mrx,
dmrx,
eds,
simpledmrs
simpledmrs,
penman
)
from delphin.extra import latex
codecs = dict([
Expand All @@ -176,18 +178,26 @@ def convert(args):
('eds', (eds.loads, eds.dumps)),
('mrs-json', (_mrs_json.loads, _mrs_json.dumps)),
('dmrs-json', (_dmrs_json.loads, _dmrs_json.dumps)),
('eds-json', (_eds_json.loads, _eds_json.dumps)),
('dmrs-penman', (partial(penman.loads, model=xmrs.Dmrs),
partial(penman.dumps, model=xmrs.Dmrs))),
('eds-penman', (partial(penman.loads, model=eds.Eds),
partial(penman.dumps, model=eds.Eds))),
('simpledmrs', (None, simpledmrs.dumps)),
('dmrs-tikz', (None, latex.dmrs_tikz_dependency))
])
decoders = set(k for k, cd in codecs.items() if cd[0])
encoders = set(k for k, cd in codecs.items() if cd[1])

# arg validation
if args['--from'] not in decoders:
sys.exit('Source format must be one of: {}'
.format(', '.join(sorted(decoders))))
if args['--to'] not in encoders:
sys.exit('Source format must be one of: {}'
.format(', '.join(sorted(encoders))))
if args['--from'].startswith('eds') and not args['--to'].startswith('eds'):
sys.exit('Conversion from EDS to non-EDS currently not supported.')
args['--color'] = (
args['--color'] == 'always' or
(args['--color'] == 'auto' and sys.stdout.isatty())
Expand Down Expand Up @@ -340,15 +350,23 @@ def loads(self, s):
def dumps(self, xs, pretty_print=False, indent=None, **kwargs):
    """
    Serialize the objects in *xs* to a JSON array string.

    Args:
        xs: an iterable of objects to serialize; any item that is not
            already an instance of `self.CLS` is first converted with
            `self.CLS.from_xmrs()`
        pretty_print: if true and *indent* is None, indent by 2 spaces
        indent: passed to `json.dumps()` as its *indent* argument
        **kwargs: accepted so all serializers share a signature; ignored
    Returns:
        the JSON string
    """
    if pretty_print and indent is None:
        indent = 2
    cls = self.CLS
    # Only one return: an earlier unconditional return would bypass the
    # conversion step and hand foreign objects straight to to_dict().
    dicts = [
        cls.to_dict(x if isinstance(x, cls) else cls.from_xmrs(x))
        for x in xs
    ]
    return json.dumps(dicts, indent=indent)

class _DMRS_JSON(_MRS_JSON):
    """JSON codec variant whose model class is `xmrs.Dmrs`."""

    CLS = xmrs.Dmrs

class _EDS_JSON(_MRS_JSON):
    """JSON codec variant whose model class is `eds.Eds`."""

    CLS = eds.Eds


# module-level codec instances; their loads/dumps are what the
# `convert` command's codec table refers to
_mrs_json = _MRS_JSON()
_dmrs_json = _DMRS_JSON()
_eds_json = _EDS_JSON()

# working with directories and profiles

Expand Down
68 changes: 64 additions & 4 deletions delphin/mrs/eds.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,10 @@ def to_dict(self, properties=True):
if node.lnk is not None:
nd['lnk'] = {'from': node.cfrom, 'to': node.cto}
if properties:
props = node.sortinfo
if node.cvarsort is not None:
nd['type'] = node.cvarsort
props = node.properties
if props:
if CVARSORT in props:
nd['type'] = props[CVARSORT]
del props[CVARSORT]
nd['properties'] = props
if node.carg is not None:
nd['carg'] = node.carg
Expand Down Expand Up @@ -129,6 +128,67 @@ def from_dict(cls, d):
nodes.sort(key=lambda n: (n.cfrom, -n.cto))
return cls(top, nodes=nodes, edges=edges)

def to_triples(self, short_pred=True, properties=True):
    """
    Encode the graph as a list of (source, relation, target) triples.

    All node triples come first, followed by all edge triples.

    Args:
        short_pred: if True, use each predicate's short form;
            otherwise use its full string
        properties: if True, also emit a 'type' triple (when the node
            has a cvarsort) and one triple per node property
    Returns:
        a list of 3-tuples
    """
    node_triples, edge_triples = [], []
    # sort nodeids just so top var is first
    nodes = sorted(self.nodes(), key=lambda n: n.nodeid != self.top)
    for node in nodes:
        nid = node.nodeid
        pred = node.pred.short_form() if short_pred else node.pred.string
        node_triples.append((nid, 'predicate', pred))
        if node.lnk:
            node_triples.append((nid, 'lnk', '"{}"'.format(str(node.lnk))))
        if node.carg:
            node_triples.append((nid, 'carg', '"{}"'.format(node.carg)))
        if properties:
            # Read the sort from the node itself; looking it up in a
            # `props` mapping here would use the name before it is
            # assigned (or a stale mapping from the previous node).
            if node.cvarsort is not None:
                node_triples.append((nid, 'type', node.cvarsort))
            props = node.properties
            node_triples.extend((nid, p, v) for p, v in props.items())
        edge_triples.extend(
            (nid, rargname, tgt)
            for rargname, tgt in sorted(
                self.edges(nid).items(),
                key=lambda x: rargname_sortkey(x[0])
            )
        )
    return node_triples + edge_triples

@classmethod
def from_triples(cls, triples):
    """
    Build an instance from (source, relation, target) triples.

    Relations are interpreted as follows: 'predicate' sets the node's
    pred, 'lnk' its character span, 'carg' its constant argument, and
    'type' its CVARSORT sortinfo entry. Any other all-lowercase
    relation is stored as a sortinfo (property) entry; everything else
    becomes an edge. The first source seen becomes the top.

    Args:
        triples: an iterable of 3-tuples (source, relation, target)
    Returns:
        an instance of *cls*; its top is None if *triples* is empty
    """
    nids, nd, edges = [], {}, []
    for src, rel, tgt in triples:
        if src not in nd:
            # remember first-seen order; the first node is the top
            nids.append(src)
            nd[src] = {'pred': None, 'lnk': None, 'carg': None, 'si': []}
        if rel == 'predicate':
            nd[src]['pred'] = Pred.string_or_grammar_pred(tgt)
        elif rel == 'lnk':
            cfrom, cto = tgt.strip('"<>').split(':')
            nd[src]['lnk'] = Lnk.charspan(int(cfrom), int(cto))
        elif rel == 'carg':
            # slices rather than indexing so an empty carg does not
            # raise IndexError
            if tgt[:1] == '"' and tgt[-1:] == '"':
                tgt = tgt[1:-1]
            nd[src]['carg'] = tgt
        elif rel == 'type':
            nd[src]['si'].append((CVARSORT, tgt))
        elif rel.islower():
            nd[src]['si'].append((rel, tgt))
        else:
            edges.append((src, rel, tgt))
    nodes = [
        Node(
            nodeid=nid,
            pred=nd[nid]['pred'],
            sortinfo=nd[nid]['si'],
            lnk=nd[nid]['lnk'],
            carg=nd[nid]['carg']
        ) for nid in nids
    ]
    top = nids[0] if nids else None
    return cls(top=top, nodes=nodes, edges=edges)


def _find_dependencies(m, eps):
deps = {}
Expand Down
122 changes: 122 additions & 0 deletions delphin/mrs/penman.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@

"""
Serialization functions for the PENMAN graph format.
Unlike other *MRS serializers, this one takes a *model* argument for
the load(), loads(), dump(), and dumps() functions, which determines what
the graph will look like. This is because DMRS and EDS (and possibly
others) yield different graph structures, but both can be encoded as
PENMAN graphs. In this sense, it's more like JSON formatting of *MRS.
"""

from __future__ import absolute_import, print_function

import penman

from delphin.mrs.config import LTOP_NODEID


class XMRSCodec(penman.PENMANCodec):
    """PENMAN codec subclass overriding the relation/variable settings
    used when reading and writing *MRS graphs."""

    TOP_VAR = LTOP_NODEID
    TOP_REL = 'top'
    # matches the 'predicate' relation used in node triples
    TYPE_REL = 'predicate'


def load(fh, model):
    """
    Read PENMAN graphs from a file and build *model* objects from them.

    Args:
        fh: filename or file object
        model: the Xmrs subclass instantiated from decoded triples
    Returns:
        a list of objects (of class *model*)
    """
    return [
        model.from_triples(graph.triples())
        for graph in penman.load(fh, cls=XMRSCodec)
    ]


def loads(s, model):
    """
    Read PENMAN graphs from a string and build *model* objects from them.

    Args:
        s: a string containing PENMAN graphs
        model: the Xmrs subclass instantiated from decoded triples
    Returns:
        a list of objects (of class *model*)
    """
    return [
        model.from_triples(graph.triples())
        for graph in penman.loads(s, cls=XMRSCodec)
    ]


def dump(fh, xs, model=None, properties=False, indent=True, **kwargs):
    """
    Serialize [Xmrs] (or subclass) objects to PENMAN and write to a file

    Args:
        fh: filename or file object
        xs: an iterator of [Xmrs] objects to serialize
        model: the Xmrs subclass used to get triples
        properties: if True, encode variable properties
        indent: if True, adaptively indent; if False or None, don't
            indent; if a non-negative integer N, indent N spaces per level
        pretty_print: (deprecated) if set, it overrides indent
    Returns:
        None
    """
    text = dumps(
        xs, model=model, properties=properties, indent=indent, **kwargs
    )
    # Test the *fh* argument itself: anything with a write() method is
    # treated as an open file object, anything else as a path to open.
    if hasattr(fh, 'write'):
        print(text, file=fh)
    else:
        with open(fh, 'w') as outfile:
            print(text, file=outfile)


def dumps(xs, model=None, properties=False, indent=True, **kwargs):
    """
    Serialize [Xmrs] (or subclass) objects to PENMAN notation

    Args:
        xs: an iterator of [Xmrs] objects to serialize
        model: the Xmrs subclass used to get triples; defaults to the
            class of the first item in *xs*
        properties: if True, encode variable properties
        indent: if True, adaptively indent; if False or None, don't
            indent; if a non-negative integer N, indent N spaces per level
        pretty_print: (deprecated) if set, it overrides indent
    Returns:
        the PENMAN serialization of *xs* (the empty string if *xs* is
        empty)
    Raises:
        TypeError: if *model* does not implement to_triples()
    """
    xs = list(xs)

    if not xs:
        return ''

    if model is None:
        model = xs[0].__class__

    if not hasattr(model, 'to_triples'):
        raise TypeError(
            '{} class does not implement to_triples()'.format(model.__name__)
        )

    codec = XMRSCodec()
    graphs = []
    for x in xs:
        # Convert only objects that are not already of the target model;
        # re-converting an instance of *model* is at best wasted work and
        # may fail for models whose from_xmrs() expects a different
        # source type.
        if not isinstance(x, model):
            x = model.from_xmrs(x)
        graphs.append(
            codec.triples_to_graph(
                model.to_triples(x, properties=properties)
            )
        )

    if 'pretty_print' in kwargs:
        indent = kwargs['pretty_print']

    return penman.dumps(graphs, cls=XMRSCodec, indent=indent)


def _canonical_ids(ts):
    """Return the triples *ts* unchanged."""
    # string-normalizing the ids is currently disabled:
    # [(str(s), r, str(t)) for s, r, t in ts]
    return ts
Loading

0 comments on commit c11f82f

Please sign in to comment.