Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DOC: Adding script to validate docstrings, and generate list of all functions/methods with state #19898

Merged
merged 8 commits into from
Mar 2, 2018
355 changes: 355 additions & 0 deletions scripts/validate_docstrings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,355 @@
#!/usr/bin/env python
"""
Analyze docstrings to detect errors.

If no argument is provided, it does a quick check of docstrings and returns
a csv with all API functions and results of basic checks.

If a function or method is provided in the form "pandas.function",
"pandas.module.class.method", etc. a list of all errors in the docstring for
the specified function or method.

Usage::
$ ./validate_docstrings.py
$ ./validate_docstrings.py pandas.DataFrame.head
"""
import os
import sys
import csv
import re
import functools
import argparse
import contextlib
import inspect
import importlib
import doctest
import textwrap
try:
from io import StringIO
except ImportError:
from cStringIO import StringIO
import numpy

BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

sys.path.insert(0, os.path.join(BASE_PATH))
import pandas

sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext'))
from numpydoc.docscrape import NumpyDocString


def _to_original_callable(obj):
while True:
if inspect.isfunction(obj) or inspect.isclass(obj):
f = inspect.getfile(obj)
if f.startswith('<') and f.endswith('>'):
return None
return obj
if inspect.ismethod(obj):
obj = obj.__func__
elif isinstance(obj, functools.partial):
obj = obj.func
elif isinstance(obj, property):
obj = obj.fget
else:
return None


def _output_header(title, width=80, char='#'):
full_line = char * width
side_len = (width - len(title) - 2) // 2
adj = '' if len(title) % 2 == 0 else ' '
title_line = '{side} {title}{adj} {side}'.format(side=char * side_len,
title=title,
adj=adj)

return '\n{full_line}\n{title_line}\n{full_line}\n\n'.format(
full_line=full_line, title_line=title_line)


class Docstring:
def __init__(self, method_name, method_obj):
self.method_name = method_name
self.method_obj = method_obj
self.raw_doc = method_obj.__doc__ or ''
self.raw_doc = textwrap.dedent(self.raw_doc)
self.doc = NumpyDocString(self.raw_doc)

def __len__(self):
return len(self.raw_doc)

@property
def source_file_name(self):
fname = inspect.getsourcefile(self.method_obj)
if fname:
fname = os.path.relpath(fname, BASE_PATH)
return fname

@property
def source_file_def_line(self):
try:
return inspect.getsourcelines(self.method_obj)[-1]
except OSError:
pass

@property
def github_url(self):
url = 'https://github.com/pandas-dev/pandas/blob/master/'
url += '{}#L{}'.format(self.source_file_name,
self.source_file_def_line)
return url

@property
def first_line_blank(self):
if self.raw_doc:
return not bool(self.raw_doc.split('\n')[0].strip())

@property
def summary(self):
if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1:
return ''
return ' '.join(self.doc['Summary'])

@property
def extended_summary(self):
if not self.doc['Extended Summary'] and len(self.doc['Summary']) > 1:
return ' '.join(self.doc['Summary'])
return ' '.join(self.doc['Extended Summary'])

@property
def needs_summary(self):
return not (bool(self.summary) and bool(self.extended_summary))

@property
def doc_parameters(self):
return self.doc['Parameters']

@property
def signature_parameters(self):
if not inspect.isfunction(self.method_obj):
return tuple()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you do this? As the code below works fine for a class as well, eg pandas.DataFrame (while now the script will say the keywords and signature did not match, while in fact they do)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't research much, but if I introspect parameters from a non-function I get this error:

Traceback (most recent call last):
  File "./validate_docstrings.py", line 353, in <module>
    sys.exit(main(args.function))
  File "./validate_docstrings.py", line 337, in main
    return validate_all()
  File "./validate_docstrings.py", line 259, in validate_all
    int(doc.correct_parameters),
  File "./validate_docstrings.py", line 159, in correct_parameters
    return not bool(self.parameter_mismatches)
  File "./validate_docstrings.py", line 138, in parameter_mismatches
    signature_params = self.signature_parameters
  File "./validate_docstrings.py", line 130, in signature_parameters
    params = tuple(inspect.signature(self.method_obj).parameters.keys())
  File "/home/mgarcia/.anaconda3/envs/pandas_dev/lib/python3.6/inspect.py", line 3036, in signature
    return Signature.from_callable(obj, follow_wrapped=follow_wrapped)
  File "/home/mgarcia/.anaconda3/envs/pandas_dev/lib/python3.6/inspect.py", line 2786, in from_callable
    follow_wrapper_chains=follow_wrapped)
  File "/home/mgarcia/.anaconda3/envs/pandas_dev/lib/python3.6/inspect.py", line 2341, in _signature_from_callable
    'no signature found for builtin type {!r}'.format(obj))
ValueError: no signature found for builtin type <class 'pandas.core.indexing._AtIndexer'>

params = tuple(inspect.signature(self.method_obj).parameters.keys())
if params and params[0] in ('self', 'cls'):
return params[1:]
return params

@property
def parameter_mismatches(self):
errs = []
signature_params = self.signature_parameters
if self.doc_parameters:
doc_params = list(zip(*self.doc_parameters))[0]
else:
doc_params = []

missing = set(signature_params) - set(doc_params)
if missing:
errs.append('Parameters {!r} not documented'.format(missing))
extra = set(doc_params) - set(signature_params)
if extra:
errs.append('Unknown parameters {!r}'.format(extra))
if not missing and not extra and signature_params != doc_params:
errs.append('Wrong parameters order. ' +
'Actual: {!r}. '.format(signature_params) +
'Documented: {!r}'.format(doc_params))

return errs

@property
def correct_parameters(self):
return not bool(self.parameter_mismatches)

@property
def see_also(self):
return self.doc['See Also']

@property
def examples(self):
return self.doc['Examples']

@property
def first_line_ends_in_dot(self):
if self.doc:
return self.doc.split('\n')[0][-1] == '.'

@property
def deprecated(self):
pattern = re.compile('.. deprecated:: ')
return (self.method_name.startswith('pandas.Panel') or
bool(pattern.search(self.summary)) or
bool(pattern.search(self.extended_summary)))

@property
def examples_errors(self):
flags = doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL
finder = doctest.DocTestFinder()
runner = doctest.DocTestRunner(optionflags=flags)
context = {'np': numpy, 'pd': pandas}
error_msgs = ''
for test in finder.find(self.raw_doc, self.method_name, globs=context):
f = StringIO()
with contextlib.redirect_stdout(f):
runner.run(test)
error_msgs += f.getvalue()
return error_msgs


def get_api_items():
api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst')

position = None
with open(api_fname) as f:
for line in f:
if line.startswith('.. currentmodule::'):
current_module = line.replace('.. currentmodule::', '').strip()
continue

if line == '.. autosummary::\n':
position = 'autosummary'
continue

if position == 'autosummary':
if line == '\n':
position = 'items'
continue

if position == 'items':
if line == '\n':
position = None
continue
item = line.strip()
func = importlib.import_module(current_module)
for part in item.split('.'):
func = getattr(func, part)

yield '.'.join([current_module, item]), func


def validate_all():
writer = csv.writer(sys.stdout)
writer.writerow(['Function or method',
'Type',
'File',
'Code line',
'GitHub link',
'Is deprecated',
'Has summary',
'Has extended summary',
'Parameters ok',
'Has examples',
'Shared code with'])
seen = {}
for func_name, func in get_api_items():
obj_type = type(func).__name__
original_callable = _to_original_callable(func)
if original_callable is None:
writer.writerow([func_name, obj_type] + [''] * 9)
else:
doc = Docstring(func_name, original_callable)
key = doc.source_file_name, doc.source_file_def_line
shared_code = seen.get(key, '')
seen[key] = func_name
writer.writerow([func_name,
obj_type,
doc.source_file_name,
doc.source_file_def_line,
doc.github_url,
int(doc.deprecated),
int(bool(doc.summary)),
int(bool(doc.extended_summary)),
int(doc.correct_parameters),
int(bool(doc.examples)),
shared_code])

return 0


def validate_one(func_name):
for maxsplit in range(1, func_name.count('.') + 1):
# TODO when py3 only replace by: module, *func_parts = ...
func_name_split = func_name.rsplit('.', maxsplit=maxsplit)
module = func_name_split[0]
func_parts = func_name_split[1:]
try:
func_obj = importlib.import_module(module)
except ImportError:
pass
else:
continue

if 'module' not in locals():
raise ImportError('No module can be imported '
'from "{}"'.format(func_name))

for part in func_parts:
func_obj = getattr(func_obj, part)

doc = Docstring(func_name, func_obj)

sys.stderr.write(_output_header('Docstring ({})'.format(func_name)))
sys.stderr.write('{}\n'.format(doc.raw_doc))

errs = []
if not doc.summary:
errs.append('No summary found')
else:
if not doc.summary[0].isupper():
errs.append('Summary does not start with capital')
if doc.summary[-1] != '.':
errs.append('Summary does not end with dot')
if doc.summary.split(' ')[0][-1] == 's':
errs.append('Summary must start with infinitive verb, '
'not third person (e.g. use "Generate" instead of '
'"Generates")')
if not doc.extended_summary:
errs.append('No extended summary found')

param_errs = doc.parameter_mismatches
if param_errs:
errs.append('Errors in parameters section')
for param_err in param_errs:
errs.append('\t{}'.format(param_err))

examples_errs = ''
if not doc.examples:
errs.append('No examples section found')
else:
examples_errs = doc.examples_errors
if examples_errs:
errs.append('Examples do not pass tests')

sys.stderr.write(_output_header('Validation'))
if errs:
sys.stderr.write('Errors found:\n')
for err in errs:
sys.stderr.write('\t{}\n'.format(err))
else:
sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name))

if examples_errs:
sys.stderr.write(_output_header('Doctests'))
sys.stderr.write(examples_errs)

return len(errs)


def main(function):
if function is None:
return validate_all()
else:
return validate_one(function)


if __name__ == '__main__':
argparser = argparse.ArgumentParser(
description='validate pandas docstrings')
argparser.add_argument('function',
nargs='?',
default=None,
help=('function or method to validate '
'(e.g. pandas.DataFrame.head) '
'if not provided, all docstrings '
'are validated'))
args = argparser.parse_args()
sys.exit(main(args.function))