Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DOC: Adding script to validate docstrings, and generate list of all functions/methods with state #19898

Merged
merged 8 commits into from
Mar 2, 2018
296 changes: 296 additions & 0 deletions scripts/validate_docstrings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
#!/usr/bin/env python
"""
Analyze docstrings to detect errors.

If no argument is provided, it does a quick check of docstrings and returns
a csv with all API functions and results of basic checks.

If a function or method is provided in the form "pandas.function",
"pandas.module.class.method", etc. a list of all errors in the docstring for
the specified function or method.

Usage::
$ ./validate_docstrings.py
$ ./validate_docstrings.py pandas.DataFrame.head
"""
import os
import sys
import csv
import re
import functools
import argparse
import inspect
import importlib
import doctest
import numpy

BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

sys.path.insert(0, os.path.join(BASE_PATH))
import pandas

sys.path.insert(1, os.path.join(BASE_PATH, 'doc', 'sphinxext'))
from numpydoc.docscrape import NumpyDocString


def _to_original_callable(obj):
while True:
if inspect.isfunction(obj) or inspect.isclass(obj):
f = inspect.getfile(obj)
if f.startswith('<') and f.endswith('>'):
return None
return obj
if inspect.ismethod(obj):
obj = obj.__func__
elif isinstance(obj, functools.partial):
obj = obj.func
elif isinstance(obj, property):
obj = obj.fget
else:
return None


class Docstring:
def __init__(self, method_name, method_obj):
self.method_name = method_name
self.method_obj = method_obj
self.raw_doc = method_obj.__doc__ or ''
self.doc = NumpyDocString(self.raw_doc)

def __len__(self):
return len(self.raw_doc)

@property
def source_file_name(self):
fname = inspect.getsourcefile(self.method_obj)
if fname:
fname = os.path.relpath(fname, BASE_PATH)
return fname

@property
def source_file_def_line(self):
try:
return inspect.getsourcelines(self.method_obj)[-1]
except OSError:
pass

@property
def github_url(self):
url = 'https://github.com/pandas-dev/pandas/blob/master/'
url += '{}#L{}'.format(self.source_file_name,
self.source_file_def_line)
return url

@property
def first_line_blank(self):
if self.raw_doc:
return not bool(self.raw_doc.split('\n')[0].strip())

@property
def summary(self):
return ' '.join(self.doc['Summary'])

@property
def extended_summary(self):
return ' '.join(self.doc['Extended Summary'])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This could return the summary, depending on how long the summary is. Eg is the summary is longer than 2 lines, regard it as an extended summary, and that there is no summary line.

Becuase now eg for the DataFrame.resample docstring, I get the message "No extended summary found", while actually it has one, but the summary line is missing, so that would be a more helpful message.


@property
def needs_summary(self):
return not (bool(self.summary) and bool(self.extended_summary))

@property
def doc_parameters(self):
return self.doc['Parameters']

@property
def signature_parameters(self):
if not inspect.isfunction(self.method_obj):
return tuple()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason you do this? As the code below works fine for a class as well, eg pandas.DataFrame (while now the script will say the keywords and signature did not match, while in fact they do)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't research much, but if I introspect parameters from a non-function I get this error:

Traceback (most recent call last):
  File "./validate_docstrings.py", line 353, in <module>
    sys.exit(main(args.function))
  File "./validate_docstrings.py", line 337, in main
    return validate_all()
  File "./validate_docstrings.py", line 259, in validate_all
    int(doc.correct_parameters),
  File "./validate_docstrings.py", line 159, in correct_parameters
    return not bool(self.parameter_mismatches)
  File "./validate_docstrings.py", line 138, in parameter_mismatches
    signature_params = self.signature_parameters
  File "./validate_docstrings.py", line 130, in signature_parameters
    params = tuple(inspect.signature(self.method_obj).parameters.keys())
  File "/home/mgarcia/.anaconda3/envs/pandas_dev/lib/python3.6/inspect.py", line 3036, in signature
    return Signature.from_callable(obj, follow_wrapped=follow_wrapped)
  File "/home/mgarcia/.anaconda3/envs/pandas_dev/lib/python3.6/inspect.py", line 2786, in from_callable
    follow_wrapper_chains=follow_wrapped)
  File "/home/mgarcia/.anaconda3/envs/pandas_dev/lib/python3.6/inspect.py", line 2341, in _signature_from_callable
    'no signature found for builtin type {!r}'.format(obj))
ValueError: no signature found for builtin type <class 'pandas.core.indexing._AtIndexer'>

params = tuple(inspect.signature(self.method_obj).parameters.keys())
if params and params[0] in ('self', 'cls'):
return params[1:]
return params

@property
def correct_parameters(self):
if self.doc_parameters:
doc_param_names = list(zip(*self.doc_parameters))[0]
return doc_param_names == self.signature_parameters
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We could return here the non-fitting ones to make the message more informative.

non_matching = set(doc_param_names) ^ set(signature_parameters)

(or do a difference in both directions to be even more explicit where it is missing, but that makes it a bit more complex)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Another thing: do we care about the order? Because now for DataFrame.interpolate I got the message that they did not message, but if I print both lists:

(Pdb) doc.signature_parameters
('method', 'axis', 'limit', 'inplace', 'limit_direction', 'limit_area', 'downcast', 'kwargs')
(Pdb) p list(zip(*doc.doc_parameters))[0]
('method', 'axis', 'limit', 'limit_direction', 'limit_area', 'inplace', 'downcast', 'kwargs')

So this will be very confusing I think. So either we need to say that in the error message, or for now not care about the order (I would maybe go for this last one, as in some cases it can be done on purpose, like putting a deprecated one at the end of the docstring, but for compatibility you need to keep it in its current place)


return not bool(self.signature_parameters)

@property
def see_also(self):
return self.doc['See Also']

@property
def examples(self):
return self.doc['Examples']

@property
def first_line_ends_in_dot(self):
if self.doc:
return self.doc.split('\n')[0][-1] == '.'

@property
def deprecated(self):
pattern = re.compile('.. deprecated:: ')
return (self.method_name.startswith('pandas.Panel') or
bool(pattern.search(self.summary)) or
bool(pattern.search(self.extended_summary)))

@property
def examples_pass_tests(self):
flags = doctest.NORMALIZE_WHITESPACE
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you also add the doctest.IGNORE_EXCEPTION_DETAIL ? (I think it is summing them)

finder = doctest.DocTestFinder()
runner = doctest.DocTestRunner(optionflags=flags)
context = {'np': numpy, 'pd': pandas}
total_failed = 0
for test in finder.find(self.raw_doc, self.method_name, globs=context):
total_failed += runner.run(test).failed
return total_failed == 0


def get_api_items():
api_fname = os.path.join(BASE_PATH, 'doc', 'source', 'api.rst')

position = None
with open(api_fname) as f:
for line in f:
if line.startswith('.. currentmodule::'):
current_module = line.replace('.. currentmodule::', '').strip()
continue

if line == '.. autosummary::\n':
position = 'autosummary'
continue

if position == 'autosummary':
if line == '\n':
position = 'items'
continue

if position == 'items':
if line == '\n':
position = None
continue
item = line.strip()
func = importlib.import_module(current_module)
for part in item.split('.'):
func = getattr(func, part)

yield '.'.join([current_module, item]), func


def validate_all():
writer = csv.writer(sys.stdout)
writer.writerow(['Function or method',
'Type',
'File',
'Code line',
'GitHub link',
'Is deprecated',
'Has summary',
'Has extended summary',
'Parameters ok',
'Has examples',
'Shared code with'])
seen = {}
for func_name, func in get_api_items():
obj_type = type(func).__name__
original_callable = _to_original_callable(func)
if original_callable is None:
writer.writerow([func_name, obj_type] + [''] * 9)
else:
doc = Docstring(func_name, original_callable)
key = doc.source_file_name, doc.source_file_def_line
shared_code = seen.get(key, '')
seen[key] = func_name
writer.writerow([func_name,
obj_type,
doc.source_file_name,
doc.source_file_def_line,
doc.github_url,
int(doc.deprecated),
int(bool(doc.summary)),
int(bool(doc.extended_summary)),
int(doc.correct_parameters),
int(bool(doc.examples)),
shared_code])

return 0


def validate_one(func_name):
for maxsplit in range(1, func_name.count('.') + 1):
# TODO when py3 only replace by: module, *func_parts = ...
func_name_split = func_name.rsplit('.', maxsplit=maxsplit)
module = func_name_split[0]
func_parts = func_name_split[1:]
try:
func_obj = importlib.import_module(module)
except ImportError:
pass
else:
continue

if 'module' not in locals():
raise ImportError('No module can be imported '
'from "{}"'.format(func_name))

for part in func_parts:
func_obj = getattr(func_obj, part)

doc = Docstring(func_name, func_obj)

errs = []
if not doc.summary:
errs.append('No summary found')
else:
if not doc.summary[0].isupper():
errs.append('Summary does not start with capital')
if doc.summary[-1] != '.':
errs.append('Summary does not end with dot')
if doc.summary.split(' ')[0][-1] == 's':
errs.append('Summary must start with infinitive verb, '
'not third person (e.g. use "Generate" instead of '
'"Generates")')
if not doc.extended_summary:
errs.append('No extended summary found')
if not doc.correct_parameters:
errs.append('Documented parameters do not match the signature')
if not doc.examples:
errs.append('No examples')
else:
if not doc.examples_pass_tests:
errs.append('Examples do not pass test')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe add between brackets "(see output above)" ?


if errs:
sys.stderr.write('Errors for "{}" docstring:\n'.format(func_name))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe start the string with a \n, so there is a blank line between the doctest output and this summary

for err in errs:
sys.stderr.write('\t* {}\n'.format(err))
else:
sys.stderr.write('Docstring for "{}" correct. :)\n'.format(func_name))

return len(errs)


def main(function):
if function is None:
return validate_all()
else:
return validate_one(function)


if __name__ == '__main__':
argparser = argparse.ArgumentParser(
description='validate pandas docstrings')
argparser.add_argument('function',
nargs='?',
default=None,
help=('function or method to validate '
'(e.g. pandas.DataFrame.head) '
'if not provided, all docstrings '
'are validated'))
args = argparser.parse_args()
sys.exit(main(args.function))