diff --git a/packages/python-packages/doc-warden/README.md b/packages/python-packages/doc-warden/README.md
index bc40d7c19d2..bbfc84790f7 100644
--- a/packages/python-packages/doc-warden/README.md
+++ b/packages/python-packages/doc-warden/README.md
@@ -5,13 +5,12 @@ Every CI build owned by the Azure-SDK team also needs to verify that the documen
* Enforces Readme Standards
- - [x] Readmes present
- - [ ] Readmes have appropriate contents
- - [ ] Files issues for failed standards checks
- - [ ] Exit code > 0 for issues discovered
-* Generates report for included observed packages
+ - Readmes present - *completed*
+ - Readmes have appropriate contents - *completed*
+ - Files issues for failed standards checks - *pending*
+* Generates report for included observed packages - *pending*
-This package is under development, and as such Python version compatibility has not been finalized at this time.
+This package is tested on Python 2.7 through 3.8.
## Prerequisites
This package is intended to be run as part of a pipeline within Azure DevOps. As such, [Python](https://www.python.org/downloads/) must be installed prior to attempting to install or use `Doc-Warden.` While `pip` comes pre-installed on most modern Python installs, if `pip` is an unrecognized command when attempting to install `warden`, run the following command **after** your Python installation is complete.
@@ -42,8 +41,14 @@ Example usage:
* Devops is a bit finicky with registering a console entry point, hence the `sudo` just on the installation. `sudo` is only required on devops machines.
* Assumption is that the `.docsettings` file is placed at the root of the repository.
- * To provide a different path (like `azure-sdk-for-java` does...), use:
- * `ward scan -d $(Build.SourcesDirectory) -c $(Build.SourcesDirectory)/eng/.docsettings.yml`
+To provide a different path (like `azure-sdk-for-java` does...), use:
+/:> ward scan -d $(Build.SourcesDirectory) -c $(Build.SourcesDirectory)/eng/.docsettings.yml
##### Parameter Options
@@ -102,6 +107,15 @@ A package is indicated by:
* The presence of a `package.json` file
+### Enforcing Readme Content
+`doc-warden` has the ability to check discovered readme files to ensure that a set of configured sections is present. How does it work? `doc-warden` will check each pattern present within `required_readme_sections` against all headers present within a target readme. If all the patterns match at least one header, the readme will pass content verification.
+Other Notes:
+* A `section` title is any markdown or RST that will result in a `<h1>` to `<h6>` html tag.
+* `warden` will content verify any `readme.rst` or `readme.md` file found outside the `omitted_paths` in the targeted repo.
+ * Case of the readme file title is ignored.
#### Control, the `.docsettings.yml` File, and You
Special cases often need to be configured. It seems logical that there needs be a central location (per repo) to override conventional settings. To that end, a new `.docsettings.yml` file will be added to each repo.
@@ -109,7 +123,7 @@ Special cases often need to be configured. It seems logical that there needs be
-│ .docsettings.yml
+│ .docsettings.yml
│ │
@@ -126,6 +140,9 @@ omitted_paths:
- archive/*
language: java
root_check_enabled: True
+required_readme_sections:
+  - "(Client Library for Azure .*|Microsoft Azure SDK for .*)"
+  - Getting Started
The above configuration tells `warden`...
@@ -136,6 +153,15 @@ The above configuration tells `warden`...
Possible values for `language` right now are `['net', 'java', 'js', 'python']`. Greater than one target language is not currently supported.
+##### `required_readme_sections` Configuration
+This section instructs `warden` to verify that there is at least one matching section title for each provided `section` pattern in any discovered readme. Regex is fully supported.
+The two items listed from the example `.docsettings` file will:
+- Match a header matched by a simple regex expression
+- Match a header exactly titled "Getting Started"
+Note that a regex must be surrounded by quotation marks whenever it would otherwise break `yml` parsing of the configuration file.
## Provide Feedback
-If you encounter any bugs or have suggestions, please file an issue [here]() and assign to `scbedd`.
+If you encounter any bugs or have suggestions, please file an issue [here](https://github.com/Azure/azure-sdk-tools/issues) and assign to `scbedd`.
diff --git a/packages/python-packages/doc-warden/setup.py b/packages/python-packages/doc-warden/setup.py
index df35ac17263..53d2d7fc0d0 100644
--- a/packages/python-packages/doc-warden/setup.py
+++ b/packages/python-packages/doc-warden/setup.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
from setuptools import setup, find_packages
import setuptools
@@ -25,7 +28,7 @@
- url='https://github.com/Azure/azure-sdk-tools/packages/python-packages/',
+ url='https://github.com/Azure/azure-sdk-tools/',
author='Microsoft Corporation',
@@ -45,7 +48,11 @@
install_requires = [
- 'pyyaml',
+ 'pyyaml', # docsettings file parse
+ 'markdown2', # parsing markdown to html
+ 'docutils', # parsing rst to html
+ 'pygments', # docutils uses pygments for parsing rst to html
+ 'beautifulsoup4', # parsing of generated html
entry_points = {
diff --git a/packages/python-packages/doc-warden/warden/WardenConfiguration.py b/packages/python-packages/doc-warden/warden/WardenConfiguration.py
index 1ddc3dd3b66..a44c259d4a4 100644
--- a/packages/python-packages/doc-warden/warden/WardenConfiguration.py
+++ b/packages/python-packages/doc-warden/warden/WardenConfiguration.py
@@ -1,3 +1,6 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
from __future__ import print_function
import argparse
import yaml
@@ -66,6 +69,11 @@ def __init__(self):
self.omitted_paths = []
+ try:
+ self.required_readme_sections = doc['required_readme_sections'] or []
+ except:
+ self.required_readme_sections = []
self.scan_language = args.scan_language or doc['language']
@@ -88,5 +96,6 @@ def dump(self):
'omitted_paths': self.omitted_paths,
'scan_language': self.scan_language,
'root_check_enabled': self.root_check_enabled,
- 'verbose_output': self.verbose_output
+ 'verbose_output': self.verbose_output,
+ 'required_readme_sections': self.required_readme_sections
diff --git a/packages/python-packages/doc-warden/warden/__init__.py b/packages/python-packages/doc-warden/warden/__init__.py
index 418a122512e..f032611cd99 100644
--- a/packages/python-packages/doc-warden/warden/__init__.py
+++ b/packages/python-packages/doc-warden/warden/__init__.py
@@ -1,28 +1,21 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
from .version import VERSION
-from .enforce_readme_presence import *
-from .WardenConfiguration import WardenConfiguration
+from .enforce_readme_presence import find_missing_readmes
+from .enforce_readme_content import verify_readme_content
+from .WardenConfiguration import WardenConfiguration
+from .warden_common import walk_directory_for_pattern, get_omitted_files
+from .cmd_entry import console_entry_point
-__all__ = ['WardenConfiguration',
- 'return_true',
- 'unrecognized_option',
+__all__ = [
+ 'WardenConfiguration',
+ 'find_missing_readmes',
+ 'verify_readme_content',
- 'scan_repo',
- 'results',
- 'check_package_readmes',
- 'check_python_readmes',
- 'check_js_readmes',
- 'check_net_readmes',
- 'is_net_csproj_package',
- 'check_java_readmes',
- 'is_java_pom_package_pom',
- 'check_repo_root',
- 'find_alongside_file',
- 'get_file_sets',
- 'get_omitted_files',
- 'check_match',
- 'parse_pom']
+ 'get_omitted_files',
+ ]
__version__ = VERSION
diff --git a/packages/python-packages/doc-warden/warden/cmd_entry.py b/packages/python-packages/doc-warden/warden/cmd_entry.py
new file mode 100644
index 00000000000..06c0fa3a7bf
--- /dev/null
+++ b/packages/python-packages/doc-warden/warden/cmd_entry.py
@@ -0,0 +1,27 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+from __future__ import print_function
+from .enforce_readme_presence import find_missing_readmes
+from .enforce_readme_content import verify_readme_content
+from .WardenConfiguration import WardenConfiguration
+def console_entry_point():
+    """Build the warden configuration and run the command it names.
+
+    The resolved configuration is printed first. A command that has no
+    registered handler prints an error message and exits with status 1.
+    """
+    # imported locally: the bare `exit` helper is injected by the `site`
+    # module and is missing under `python -S` or in frozen executables
+    import sys
+    cfg = WardenConfiguration()
+    print(cfg.dump())
+    command_selector = {
+        'scan': scan,
+    }
+    # single lookup; None means the command is not registered
+    handler = command_selector.get(cfg.command)
+    if handler is None:
+        print('Unrecognized command invocation {}.'.format(cfg.command))
+        sys.exit(1)
+    handler(cfg)
+def scan(config):
+    """Run the readme presence check, then the readme content check, on config."""
+    for readme_check in (find_missing_readmes, verify_readme_content):
+        readme_check(config)
diff --git a/packages/python-packages/doc-warden/warden/enforce_readme_content.py b/packages/python-packages/doc-warden/warden/enforce_readme_content.py
new file mode 100644
index 00000000000..1a48c9738f7
--- /dev/null
+++ b/packages/python-packages/doc-warden/warden/enforce_readme_content.py
@@ -0,0 +1,102 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+from __future__ import print_function
+import os
+import markdown2
+import bs4
+import re
+from .warden_common import check_match, walk_directory_for_pattern, get_omitted_files
+from docutils import core
+from docutils.writers.html4css1 import Writer,HTMLTranslator
+# fnmatch.fnmatch only ignores case where os.path.normcase folds it (Windows).
+# Spell out both cases so README.md / Readme.rst are also found on Linux and macOS.
+README_PATTERNS = [
+    '*/[Rr][Ee][Aa][Dd][Mm][Ee].[Mm][Dd]',
+    '*/[Rr][Ee][Aa][Dd][Mm][Ee].[Rr][Ss][Tt]',
+]
+# entry point
+def verify_readme_content(config):
+    """Verify required sections in every non-omitted readme under the target directory.
+
+    Readmes are discovered with README_PATTERNS, files returned by
+    get_omitted_files are skipped, and each remaining file is checked by the
+    RST or markdown verifier according to its extension. Readmes with at
+    least one missing pattern are handed to results() for reporting.
+    """
+    all_readmes = walk_directory_for_pattern(config.target_directory, README_PATTERNS)
+    # a set keeps the omission filter linear when many files are omitted
+    omitted_readmes = set(get_omitted_files(config))
+    readme_results = []
+    for readme in all_readmes:
+        if readme in omitted_readmes:
+            continue
+        # lower-case the extension so README.RST is parsed as RST, not markdown
+        ext = os.path.splitext(readme)[1].lower()
+        verifier = verify_rst_readme if ext == '.rst' else verify_md_readme
+        readme_results.append(verifier(readme, config))
+    results([readme_tuple for readme_tuple in readme_results if readme_tuple[1]], config)
+# output results
+def results(readmes_with_issues, config):
+    """Print the readmes that lack required sections and fail the run.
+
+    Args:
+        readmes_with_issues: list of (readme_path, missing_patterns) tuples.
+        config: object exposing target_directory; its normalized form is
+            removed from each printed path.
+
+    Returns silently when the list is empty; otherwise exits with status 1.
+    """
+    if not readmes_with_issues:
+        return
+    # imported locally: the bare `exit` helper comes from the `site` module
+    # and is not guaranteed to exist (e.g. `python -S`, frozen executables)
+    import sys
+    print('{} readmes have missing required sections.'.format(len(readmes_with_issues)))
+    # loop-invariant, so normalize the prefix once
+    root = os.path.normpath(config.target_directory)
+    for readme_tuple in readmes_with_issues:
+        print(readme_tuple[0].replace(root, '') + ' is missing headers with pattern(s):')
+        for missing_pattern in readme_tuple[1]:
+            print(' * {0}'.format(missing_pattern))
+    sys.exit(1)
+# parse rst to html, check for presence of appropriate sections
+def verify_rst_readme(readme, config):
+    """Convert an RST readme to HTML and report which required patterns it lacks.
+
+    Returns a (readme, missed_patterns) tuple, where missed_patterns holds the
+    entries of config.required_readme_sections that matched no header.
+    """
+    # io.open accepts `encoding` on Python 2 and 3 alike; decoding explicitly
+    # avoids depending on the machine locale, and utf-8-sig drops a leading BOM
+    import io
+    with io.open(readme, 'r', encoding='utf-8-sig') as f:
+        readme_content = f.read()
+    html_readme_content = rst_to_html(readme_content)
+    html_soup = bs4.BeautifulSoup(html_readme_content, "html.parser")
+    missed_patterns = find_missed_sections(html_soup, config.required_readme_sections)
+    return (readme, missed_patterns)
+# parse md to html, check for presence of appropriate sections
+def verify_md_readme(readme, config):
+    """Convert a markdown readme to HTML and report which required patterns it lacks.
+
+    Returns a (readme, missed_patterns) tuple, where missed_patterns holds the
+    entries of config.required_readme_sections that matched no header.
+    """
+    # io.open accepts `encoding` on Python 2 and 3 alike; decoding explicitly
+    # avoids depending on the machine locale, and utf-8-sig drops a leading BOM
+    # that would otherwise sit in front of the first header
+    import io
+    with io.open(readme, 'r', encoding='utf-8-sig') as f:
+        readme_content = f.read()
+    html_readme_content = markdown2.markdown(readme_content)
+    html_soup = bs4.BeautifulSoup(html_readme_content, "html.parser")
+    missed_patterns = find_missed_sections(html_soup, config.required_readme_sections)
+    return (readme, missed_patterns)
+# within the entire readme, are there any missing sections that are expected?
+def find_missed_sections(html_soup, patterns):
+    """Return the patterns that match none of the h1-h6 headers in html_soup.
+
+    The result lists each unmatched pattern once, in the order it appears in
+    `patterns`, so reports are stable from run to run.
+    """
+    headers = html_soup.find_all(re.compile('^h[1-6]$'))
+    observed_patterns = set()
+    for header in headers:
+        observed_patterns.update(match_regex_set(header, patterns))
+    missed_patterns = []
+    for pattern in patterns:
+        # skip repeats so a pattern configured twice is reported once
+        if pattern not in observed_patterns and pattern not in missed_patterns:
+            missed_patterns.append(pattern)
+    return missed_patterns
+# checks a header tag (soup) against a set of configured patterns
+def match_regex_set(header, patterns):
+    """Return, in their given order, the patterns that re.search finds in the header text."""
+    return [candidate for candidate in patterns if re.search(candidate, header.get_text())]
+# boilerplate for translating RST
+class HTMLFragmentTranslator(HTMLTranslator):
+    """HTMLTranslator variant whose astext() returns only the joined body parts."""
+    def __init__(self, document):
+        HTMLTranslator.__init__(self, document)
+        # assign empty values to the head/body wrapper attributes and the stylesheet
+        self.head_prefix = [''] * 5
+        self.body_prefix, self.body_suffix, self.stylesheet = [], [], []
+    def astext(self):
+        fragment = ''.join(self.body)
+        return fragment
+# module-level docutils Writer whose translator class is replaced with HTMLFragmentTranslator
+html_fragment_writer = Writer()
+html_fragment_writer.translator_class = HTMLFragmentTranslator
+# utilize boilerplate
+def rst_to_html(input_rst):
+    """Publish input_rst through docutils with the module-level fragment writer and return the output."""
+    rendered = core.publish_string(input_rst, writer = html_fragment_writer)
+    return rendered
diff --git a/packages/python-packages/doc-warden/warden/enforce_readme_presence.py b/packages/python-packages/doc-warden/warden/enforce_readme_presence.py
index bee82b1225c..46626623dc6 100644
--- a/packages/python-packages/doc-warden/warden/enforce_readme_presence.py
+++ b/packages/python-packages/doc-warden/warden/enforce_readme_presence.py
@@ -1,20 +1,15 @@
-# ------------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
-# Licensed under the MIT License. See License.txt in the project
-# root for license information.
-# ------------------------------------------------------------------------------
+# Licensed under the MIT License.
from __future__ import print_function
-import argparse
-import yaml
import pathlib
import os
import glob
import xml.etree.ElementTree as ET
import fnmatch
import zipfile
-import re
-from .WardenConfiguration import WardenConfiguration
+from .warden_common import check_match, walk_directory_for_pattern, get_omitted_files
# python 3 transitioned StringIO to be part of `io` module.
# python 2 needs the old version however
@@ -23,36 +18,15 @@
except ImportError:
from io import StringIO
-DEFAULT_LOCATION = '.docsettings.yml'
-def return_true(param):
- return True
# default option for handling an uncrecognized language
def unrecognized_option(configuration):
print('Argument {} provided is not a supported option'.format(configuration.scan_language))
-def console_entry_point():
- cfg = WardenConfiguration()
- print(cfg.dump())
- command_selector = {
- 'scan': scan_repo,
- }
- if cfg.command in command_selector:
- command_selector.get(cfg.command)(cfg)
- else:
- print('Unrecognized command invocation {}.'.format(cfg.command))
- exit(1)
-def scan_repo(config):
+def find_missing_readmes(config):
missing_readme_paths = check_package_readmes(config)
results(missing_readme_paths, config)
# print results
def results(missing_readme_paths, config):
if len(missing_readme_paths):
@@ -168,41 +142,6 @@ def get_file_sets(configuration, target_pattern, lambda_check = None):
return list(set(expected_locations) - set(omitted_files)), set(omitted_files).intersection(expected_locations)
-# gets the set of files in the target directory that have explicitly been omitted in the config settings
-def get_omitted_files(configuration):
- target_directory = configuration.target_directory
- omitted_paths = []
- dirs = configuration.omitted_paths or []
- # single special case here. if wildcard match at the beginning, do not join, use the pattern as is
- adjusted_dirs = [pattern if pattern.startswith("*") else os.path.join(target_directory, pattern) for pattern in dirs]
- omitted_paths.extend(walk_directory_for_pattern(target_directory, adjusted_dirs, None))
- return omitted_paths
-# Returns a list of files under a target directory. The files included will match any of the
-# target_patterns AND the lambda_check function.
-def walk_directory_for_pattern(target_directory, target_patterns, lambda_check = None):
- expected_locations = []
- target_directory = os.path.normpath(target_directory)
- normalized_target_patterns = [os.path.normpath(pattern) for pattern in target_patterns]
- check_function = lambda_check or return_true
- # walk the folders, filter to the patterns established
- for folder, subfolders, files in os.walk(target_directory):
- for file in files:
- file_path = os.path.join(folder, file)
- if check_match(file_path, normalized_target_patterns) and check_function(file_path):
- expected_locations.append(file_path)
- return expected_locations
-# we want to walk the files as few times as possible. as such, for omitted_files, we provide a SET
-# of patterns that we want to omit. This function simply checks
-def check_match(file_path, normalized_target_patterns):
- return any([fnmatch.fnmatch(file_path, normalized_target_pattern)
- for normalized_target_pattern in normalized_target_patterns])
# namespaces in xml really mess with xmlTree: https://bugs.python.org/issue18304
# this function provides a workaround for both parsing an xml file as well as REMOVING said namespaces
def parse_pom(file_path):
diff --git a/packages/python-packages/doc-warden/warden/version.py b/packages/python-packages/doc-warden/warden/version.py
index 1f6518e6db5..81343d72d76 100644
--- a/packages/python-packages/doc-warden/warden/version.py
+++ b/packages/python-packages/doc-warden/warden/version.py
@@ -1 +1 @@
-VERSION = '0.1.0'
+VERSION = '0.2.0'
diff --git a/packages/python-packages/doc-warden/warden/warden_common.py b/packages/python-packages/doc-warden/warden/warden_common.py
new file mode 100644
index 00000000000..cde9c089b51
--- /dev/null
+++ b/packages/python-packages/doc-warden/warden/warden_common.py
@@ -0,0 +1,43 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+import os
+import fnmatch
+# Returns a list of files under a target directory. The files included will match any of the
+# target_patterns AND the lambda_check function.
+def walk_directory_for_pattern(target_directory, target_patterns, lambda_check = None):
+    """Collect file paths under target_directory that pass both filters.
+
+    A path is kept when check_match accepts it against the normalized
+    patterns and lambda_check (or return_true when none is given) is truthy.
+    """
+    root = os.path.normpath(target_directory)
+    patterns = [os.path.normpath(raw_pattern) for raw_pattern in target_patterns]
+    accept = lambda_check if lambda_check else return_true
+    matches = []
+    # walk the folders, filter to the patterns established
+    for folder, _, file_names in os.walk(root):
+        candidates = (os.path.join(folder, name) for name in file_names)
+        matches.extend(path for path in candidates if check_match(path, patterns) and accept(path))
+    return matches
+# gets the set of files in the target directory that have explicitly been omitted in the config settings
+def get_omitted_files(configuration):
+    """Return the files under configuration.target_directory matched by configuration.omitted_paths."""
+    root = configuration.target_directory
+    adjusted_patterns = []
+    for pattern in configuration.omitted_paths or []:
+        # single special case: a pattern that leads with a wildcard is used as is,
+        # everything else is anchored under the target directory
+        if pattern.startswith("*"):
+            adjusted_patterns.append(pattern)
+        else:
+            adjusted_patterns.append(os.path.join(root, pattern))
+    return list(walk_directory_for_pattern(root, adjusted_patterns, None))
+# we want to walk the files as few times as possible. as such, for omitted_files, we provide a SET
+# of patterns that we want to omit. This function simply checks whether a path matches any of them.
+def check_match(file_path, normalized_target_patterns):
+    """Return True when file_path fnmatches at least one of the given patterns."""
+    for candidate_pattern in normalized_target_patterns:
+        if fnmatch.fnmatch(file_path, candidate_pattern):
+            return True
+    return False
+def return_true(param):
+    """Accept any argument and return True; a pass-everything filter."""
+    return True