diff --git a/packages/python-packages/doc-warden/README.md b/packages/python-packages/doc-warden/README.md index bc40d7c19d2..bbfc84790f7 100644 --- a/packages/python-packages/doc-warden/README.md +++ b/packages/python-packages/doc-warden/README.md @@ -5,13 +5,12 @@ Every CI build owned by the Azure-SDK team also needs to verify that the documen Features: * Enforces Readme Standards - - [x] Readmes present - - [ ] Readmes have appropriate contents - - [ ] Files issues for failed standards checks - - [ ] Exit code > 0 for issues discovered -* Generates report for included observed packages + - Readmes present - *completed* + - Readmes have appropriate contents - *completed* + - Files issues for failed standards checks - *pending* +* Generates report for included observed packages - *pending* -This package is under development, and as such Python version compatibility has not been finalized at this time. +This package is tested on Python 2.7 -> 3.8. ## Prerequisites This package is intended to be run as part of a pipeline within Azure DevOps. As such, [Python](https://www.python.org/downloads/) must be installed prior to attempting to install or use `Doc-Warden.` While `pip` comes pre-installed on most modern Python installs, if `pip` is an unrecognized command when attempting to install `warden`, run the following command **after** your Python installation is complete. @@ -42,8 +41,14 @@ Example usage: * Devops is a bit finicky with registering a console entry point, hence the `sudo` just on the installation. `sudo` is only required on devops machines. * Assumption is that the `.docsettings` file is placed at the root of the repository. 
- * To provide a different path (like `azure-sdk-for-java` does...), use: - * `ward scan -d $(Build.SourcesDirectory) -c $(Build.SourcesDirectory)/eng/.docsettings.yml` + +To provide a different path (like `azure-sdk-for-java` does...), use: + +``` + +/:> ward scan -d $(Build.SourcesDirectory) -c $(Build.SourcesDirectory)/eng/.docsettings.yml + +``` ##### Parameter Options @@ -102,6 +107,15 @@ A package is indicated by: * The presence of a `package.json` file +### Enforcing Readme Content + +`doc-warden` has the ability to check discovered readme files to ensure that a set of configured sections is present. How does it work? `doc-warden` will check each pattern present within `required_readme_sections` against all headers present within a target readme. If all the patterns match at least one header, the readme will pass content verification. + +Other Notes: +* A `section` title is any markdown or RST that will result in a `

<h1>` to `<h6>

` html tag. +* `warden` will content verify any `readme.rst` or `readme.md` file found outside the `omitted_paths` in the targeted repo. + * Case of the readme file title is ignored. + #### Control, the `.docsettings.yml` File, and You Special cases often need to be configured. It seems logical that there needs be a central location (per repo) to override conventional settings. To that end, a new `.docsettings.yml` file will be added to each repo. @@ -109,7 +123,7 @@ Special cases often need to be configured. It seems logical that there needs be ``` │ README.md -│ .docsettings.yml +│ .docsettings.yml │ └───.azure-pipelines │ │ @@ -126,6 +140,9 @@ omitted_paths: - archive/* language: java root_check_enabled: True +required_readme_sections: + - "(Client Library for Azure .*|Microsoft Azure SDK for .*)" + - Getting Started ``` The above configuration tells `warden`... @@ -136,6 +153,15 @@ The above configuration tells `warden`... Possible values for `language` right now are `['net', 'java', 'js', 'python']`. Greater than one target language is not currently supported. +##### `required_readme_sections` Configuration +This section instructs `warden` to verify that there is at least one matching section title for each provided `section` pattern in any discovered readme. Regex is fully supported. + +The two items listed from the example `.docsettings` file will: +- Match a header matched by a simple regex expression +- Match a header exactly titled "Getting Started" + +Note that the regex is surrounded by quotation marks where the regex will break `yml` parsing of the configuration file. + ## Provide Feedback -If you encounter any bugs or have suggestions, please file an issue [here]() and assign to `scbedd`. +If you encounter any bugs or have suggestions, please file an issue [here](https://github.com/Azure/azure-sdk-tools/issues) and assign to `scbedd`. 
diff --git a/packages/python-packages/doc-warden/setup.py b/packages/python-packages/doc-warden/setup.py index df35ac17263..53d2d7fc0d0 100644 --- a/packages/python-packages/doc-warden/setup.py +++ b/packages/python-packages/doc-warden/setup.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + from setuptools import setup, find_packages import setuptools @@ -25,7 +28,7 @@ description=DESCRIPTION, long_description=long_description, long_description_content_type='text/markdown', - url='https://github.com/Azure/azure-sdk-tools/packages/python-packages/', + url='https://github.com/Azure/azure-sdk-tools/', author='Microsoft Corporation', author_email='azuresdkengsysadmins@microsoft.com', @@ -45,7 +48,11 @@ ], packages=find_packages(), install_requires = [ - 'pyyaml', + 'pyyaml', # docsettings file parse + 'markdown2', # parsing markdown to html + 'docutils', # parsing rst to html + 'pygments', # docutils uses pygments for parsing rst to html + 'beautifulsoup4', # parsing of generated html 'pathlib' ], entry_points = { diff --git a/packages/python-packages/doc-warden/warden/WardenConfiguration.py b/packages/python-packages/doc-warden/warden/WardenConfiguration.py index 1ddc3dd3b66..a44c259d4a4 100644 --- a/packages/python-packages/doc-warden/warden/WardenConfiguration.py +++ b/packages/python-packages/doc-warden/warden/WardenConfiguration.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ from __future__ import print_function import argparse import yaml @@ -66,6 +69,11 @@ def __init__(self): except: self.omitted_paths = [] + try: + self.required_readme_sections = doc['required_readme_sections'] or [] + except: + self.required_readme_sections = [] + try: self.scan_language = args.scan_language or doc['language'] except: @@ -88,5 +96,6 @@ def dump(self): 'omitted_paths': self.omitted_paths, 'scan_language': self.scan_language, 'root_check_enabled': self.root_check_enabled, - 'verbose_output': self.verbose_output + 'verbose_output': self.verbose_output, + 'required_readme_sections': self.required_readme_sections } diff --git a/packages/python-packages/doc-warden/warden/__init__.py b/packages/python-packages/doc-warden/warden/__init__.py index 418a122512e..f032611cd99 100644 --- a/packages/python-packages/doc-warden/warden/__init__.py +++ b/packages/python-packages/doc-warden/warden/__init__.py @@ -1,28 +1,21 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ from .version import VERSION -from .enforce_readme_presence import * -from .WardenConfiguration import WardenConfiguration +from .enforce_readme_presence import find_missing_readmes +from .enforce_readme_content import verify_readme_content +from .WardenConfiguration import WardenConfiguration +from .warden_common import walk_directory_for_pattern, get_omitted_files +from .cmd_entry import console_entry_point -__all__ = ['WardenConfiguration', - 'DEFAULT_LOCATION', - 'return_true', - 'unrecognized_option', +__all__ = [ + 'WardenConfiguration', + 'find_missing_readmes', + 'verify_readme_content', 'console_entry_point', - 'scan_repo', - 'results', - 'check_package_readmes', - 'check_python_readmes', - 'check_js_readmes', - 'check_net_readmes', - 'is_net_csproj_package', - 'check_java_readmes', - 'is_java_pom_package_pom', - 'check_repo_root', - 'find_alongside_file', - 'get_file_sets', - 'get_omitted_files', 'walk_directory_for_pattern', - 'check_match', - 'parse_pom'] + 'get_omitted_files', + ] __version__ = VERSION diff --git a/packages/python-packages/doc-warden/warden/cmd_entry.py b/packages/python-packages/doc-warden/warden/cmd_entry.py new file mode 100644 index 00000000000..06c0fa3a7bf --- /dev/null +++ b/packages/python-packages/doc-warden/warden/cmd_entry.py @@ -0,0 +1,27 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from __future__ import print_function + +from .enforce_readme_presence import find_missing_readmes +from .enforce_readme_content import verify_readme_content +from .WardenConfiguration import WardenConfiguration + +# CONFIGURATION. ENTRY POINT. EXECUTION. 
+def console_entry_point(): + cfg = WardenConfiguration() + print(cfg.dump()) + + command_selector = { + 'scan': scan, + } + + if cfg.command in command_selector: + command_selector.get(cfg.command)(cfg) + else: + print('Unrecognized command invocation {}.'.format(cfg.command)) + exit(1) + +def scan(config): + find_missing_readmes(config) + verify_readme_content(config) diff --git a/packages/python-packages/doc-warden/warden/enforce_readme_content.py b/packages/python-packages/doc-warden/warden/enforce_readme_content.py new file mode 100644 index 00000000000..1a48c9738f7 --- /dev/null +++ b/packages/python-packages/doc-warden/warden/enforce_readme_content.py @@ -0,0 +1,102 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +from __future__ import print_function + +import os +import markdown2 +import bs4 +import re +from .warden_common import check_match, walk_directory_for_pattern, get_omitted_files +from docutils import core +from docutils.writers.html4css1 import Writer,HTMLTranslator + +# fnmatch is case insensitive by default, just look for readme rst and md +README_PATTERNS = ['*/readme.md', '*/readme.rst'] + +# entry point +def verify_readme_content(config): + all_readmes = walk_directory_for_pattern(config.target_directory, README_PATTERNS) + omitted_readmes = get_omitted_files(config) + targeted_readmes = [readme for readme in all_readmes if readme not in omitted_readmes] + + readme_results = [] + + for readme in targeted_readmes: + ext = os.path.splitext(readme)[1] + if ext == '.rst': + readme_results.append(verify_rst_readme(readme, config)) + else: + readme_results.append(verify_md_readme(readme, config)) + + results([readme_tuple for readme_tuple in readme_results if readme_tuple[1]], config) + +# output results +def results(readmes_with_issues, config): + if len(readmes_with_issues): + print('{} readmes have missing required sections.'.format(len(readmes_with_issues))) + for readme_tuple in 
readmes_with_issues: + print(readme_tuple[0].replace(os.path.normpath(config.target_directory), '') + ' is missing headers with pattern(s):') + for missing_pattern in readme_tuple[1]: + print(' * {0}'.format(missing_pattern)) + exit(1) + +# parse rst to html, check for presence of appropriate sections +def verify_rst_readme(readme, config): + with open(readme, 'r') as f: + readme_content = f.read() + html_readme_content = rst_to_html(readme_content) + html_soup = bs4.BeautifulSoup(html_readme_content, "html.parser") + + missed_patterns = find_missed_sections(html_soup, config.required_readme_sections) + + return (readme, missed_patterns) + +# parse md to html, check for presence of appropriate sections +def verify_md_readme(readme, config): + with open(readme, 'r') as f: + readme_content = f.read() + html_readme_content = markdown2.markdown(readme_content) + html_soup = bs4.BeautifulSoup(html_readme_content, "html.parser") + + missed_patterns = find_missed_sections(html_soup, config.required_readme_sections) + + return (readme, missed_patterns) + +# within the entire readme, are there any missing sections that are expected? 
+def find_missed_sections(html_soup, patterns): + headers = html_soup.find_all(re.compile('^h[1-6]$')) + missed_patterns = [] + observed_patterns = [] + + for header in headers: + observed_patterns.extend(match_regex_set(header, patterns)) + + return list(set(patterns) - set(observed_patterns)) + +# checks a header tag (soup) against a set of configured patterns +def match_regex_set(header, patterns): + matching_patterns = [] + for pattern in patterns: + result = re.search(pattern, header.get_text()) + if result: + matching_patterns.append(pattern) + return matching_patterns + +# boilerplate for translating RST +class HTMLFragmentTranslator(HTMLTranslator): + def __init__(self, document): + HTMLTranslator.__init__(self, document) + self.head_prefix = ['','','','',''] + self.body_prefix = [] + self.body_suffix = [] + self.stylesheet = [] + def astext(self): + return ''.join(self.body) + +html_fragment_writer = Writer() +html_fragment_writer.translator_class = HTMLFragmentTranslator + +# utilize boilerplate +def rst_to_html(input_rst): + return core.publish_string(input_rst, writer = html_fragment_writer) diff --git a/packages/python-packages/doc-warden/warden/enforce_readme_presence.py b/packages/python-packages/doc-warden/warden/enforce_readme_presence.py index bee82b1225c..46626623dc6 100644 --- a/packages/python-packages/doc-warden/warden/enforce_readme_presence.py +++ b/packages/python-packages/doc-warden/warden/enforce_readme_presence.py @@ -1,20 +1,15 @@ -# ------------------------------------------------------------------------------ # Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project -# root for license information. -# ------------------------------------------------------------------------------ +# Licensed under the MIT License. 
+ from __future__ import print_function -import argparse -import yaml import pathlib import os import glob import xml.etree.ElementTree as ET import fnmatch import zipfile -import re -from .WardenConfiguration import WardenConfiguration +from .warden_common import check_match, walk_directory_for_pattern, get_omitted_files # python 3 transitioned StringIO to be part of `io` module. # python 2 needs the old version however @@ -23,36 +18,15 @@ except ImportError: from io import StringIO -DEFAULT_LOCATION = '.docsettings.yml' - -def return_true(param): - return True - # default option for handling an uncrecognized language def unrecognized_option(configuration): print('Argument {} provided is not a supported option'.format(configuration.scan_language)) exit(1) -# CONFIGURATION. ENTRY POINT. EXECUTION. -def console_entry_point(): - cfg = WardenConfiguration() - print(cfg.dump()) - - command_selector = { - 'scan': scan_repo, - } - - if cfg.command in command_selector: - command_selector.get(cfg.command)(cfg) - else: - print('Unrecognized command invocation {}.'.format(cfg.command)) - exit(1) - -def scan_repo(config): +def find_missing_readmes(config): missing_readme_paths = check_package_readmes(config) results(missing_readme_paths, config) - # print results def results(missing_readme_paths, config): if len(missing_readme_paths): @@ -168,41 +142,6 @@ def get_file_sets(configuration, target_pattern, lambda_check = None): return list(set(expected_locations) - set(omitted_files)), set(omitted_files).intersection(expected_locations) -# gets the set of files in the target directory that have explicitly been omitted in the config settings -def get_omitted_files(configuration): - target_directory = configuration.target_directory - omitted_paths = [] - dirs = configuration.omitted_paths or [] - - # single special case here. 
if wildcard match at the beginning, do not join, use the pattern as is - adjusted_dirs = [pattern if pattern.startswith("*") else os.path.join(target_directory, pattern) for pattern in dirs] - omitted_paths.extend(walk_directory_for_pattern(target_directory, adjusted_dirs, None)) - - return omitted_paths - -# Returns a list of files under a target directory. The files included will match any of the -# target_patterns AND the lambda_check function. -def walk_directory_for_pattern(target_directory, target_patterns, lambda_check = None): - expected_locations = [] - target_directory = os.path.normpath(target_directory) - normalized_target_patterns = [os.path.normpath(pattern) for pattern in target_patterns] - check_function = lambda_check or return_true - - # walk the folders, filter to the patterns established - for folder, subfolders, files in os.walk(target_directory): - for file in files: - file_path = os.path.join(folder, file) - if check_match(file_path, normalized_target_patterns) and check_function(file_path): - expected_locations.append(file_path) - - return expected_locations - -# we want to walk the files as few times as possible. as such, for omitted_files, we provide a SET -# of patterns that we want to omit. 
This function simply checks -def check_match(file_path, normalized_target_patterns): - return any([fnmatch.fnmatch(file_path, normalized_target_pattern) - for normalized_target_pattern in normalized_target_patterns]) - # namespaces in xml really mess with xmlTree: https://bugs.python.org/issue18304 # this function provides a workaround for both parsing an xml file as well as REMOVING said namespaces def parse_pom(file_path): diff --git a/packages/python-packages/doc-warden/warden/version.py b/packages/python-packages/doc-warden/warden/version.py index 1f6518e6db5..81343d72d76 100644 --- a/packages/python-packages/doc-warden/warden/version.py +++ b/packages/python-packages/doc-warden/warden/version.py @@ -1 +1 @@ -VERSION = '0.1.0' +VERSION = '0.2.0' diff --git a/packages/python-packages/doc-warden/warden/warden_common.py b/packages/python-packages/doc-warden/warden/warden_common.py new file mode 100644 index 00000000000..cde9c089b51 --- /dev/null +++ b/packages/python-packages/doc-warden/warden/warden_common.py @@ -0,0 +1,43 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import os +import fnmatch + +# Returns a list of files under a target directory. The files included will match any of the +# target_patterns AND the lambda_check function. 
def walk_directory_for_pattern(target_directory, target_patterns, lambda_check = None):
    """Return the files under ``target_directory`` that match any pattern.

    :param target_directory: root folder to walk; normalized with
        ``os.path.normpath`` before walking.
    :param target_patterns: iterable of ``fnmatch``-style patterns. Each one
        is normalized with ``os.path.normpath`` and compared against the
        joined path of every file found. ``None`` or an empty iterable
        yields no matches.
    :param lambda_check: optional callable taking a file path; a file is
        only included when it also returns a truthy value. Defaults to
        ``return_true`` (accept everything).
    :return: list of matching file paths, in ``os.walk`` order.
    """
    target_directory = os.path.normpath(target_directory)
    normalized_target_patterns = [os.path.normpath(pattern) for pattern in (target_patterns or [])]
    check_function = lambda_check or return_true

    expected_locations = []

    # walk the folders, filter to the patterns established
    for folder, _subfolders, file_names in os.walk(target_directory):
        for file_name in file_names:
            file_path = os.path.join(folder, file_name)
            if check_match(file_path, normalized_target_patterns) and check_function(file_path):
                expected_locations.append(file_path)

    return expected_locations


def get_omitted_files(configuration):
    """Return the files in the target directory explicitly omitted by config.

    :param configuration: object exposing ``target_directory`` and
        ``omitted_paths`` (a list of patterns, or a falsy value for none).
    :return: list of file paths matching any omitted pattern.
    """
    target_directory = configuration.target_directory
    omitted_patterns = configuration.omitted_paths or []

    # nothing configured means nothing can match; skip walking the whole tree
    if not omitted_patterns:
        return []

    # single special case here. if wildcard match at the beginning, do not
    # join, use the pattern as is
    adjusted_patterns = [
        pattern if pattern.startswith("*") else os.path.join(target_directory, pattern)
        for pattern in omitted_patterns
    ]

    return walk_directory_for_pattern(target_directory, adjusted_patterns, None)


def check_match(file_path, normalized_target_patterns):
    """Return True when ``file_path`` matches at least one pattern.

    Files are walked as few times as possible, so callers pass the whole SET
    of patterns and this function checks the path against each of them.

    Matching uses ``fnmatch.fnmatch``, which case-normalizes both arguments
    with ``os.path.normcase``; whether the comparison is case-sensitive
    therefore depends on the operating system.

    :param file_path: path of the file to test.
    :param normalized_target_patterns: iterable of already-normalized
        ``fnmatch`` patterns.
    """
    # generator rather than list: stop at the first matching pattern
    return any(fnmatch.fnmatch(file_path, normalized_target_pattern)
               for normalized_target_pattern in normalized_target_patterns)


def return_true(param):
    """Default ``lambda_check``: accept every file regardless of ``param``."""
    return True