From 524e4daa56ead7e551f2c9c0d5ea9460884ad521 Mon Sep 17 00:00:00 2001
From: Eric Norige <127622562+eanorige@users.noreply.github.com>
Date: Thu, 26 Oct 2023 11:54:38 -0700
Subject: [PATCH] Speedup line_offset property (#1392)

* Replace dynamic regex with string find operation
* Add a cache of line-start offsets so that looking up the offset of the current line no longer has quadratic behavior when importing large chunks of HTML
---
 docs/changelog.md      |  6 ++++++
 markdown/htmlparser.py | 22 +++++++++++++---------
 2 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/docs/changelog.md b/docs/changelog.md
index 2f9e9250..614177c6 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -8,6 +8,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). See the [Contributing Guide](contributing.md) for details.
 
+## [unreleased]
+
+### Fixed
+
+* Fix a performance problem with HTML extraction where large HTML input could trigger quadratic line counting behavior (PR#1392).
+
 ## [3.5] -- 2023-10-06
 
 ### Added
diff --git a/markdown/htmlparser.py b/markdown/htmlparser.py
index bf70b73d..4dbb1587 100644
--- a/markdown/htmlparser.py
+++ b/markdown/htmlparser.py
@@ -83,6 +83,8 @@ def __init__(self, md, *args, **kwargs):
         # Block tags that should contain no content (self closing)
         self.empty_tags = set(['hr'])
 
+        self.lineno_start_cache = [0]
+
         # This calls self.reset
         super().__init__(*args, **kwargs)
         self.md = md
@@ -94,6 +96,8 @@ def reset(self):
         self.stack = []  # When `inraw==True`, stack contains a list of tags
         self._cache = []
         self.cleandoc = []
+        self.lineno_start_cache = [0]
+
         super().reset()
 
     def close(self):
@@ -114,15 +118,15 @@ def close(self):
     @property
     def line_offset(self) -> int:
         """Returns char index in `self.rawdata` for the start of the current line. """
-        if self.lineno > 1 and '\n' in self.rawdata:
-            m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
-            if m:
-                return m.end()
-            else:  # pragma: no cover
-                # Value of `self.lineno` must exceed total number of lines.
-                # Find index of beginning of last line.
-                return self.rawdata.rfind('\n')
-        return 0
+        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
+            last_line_start_pos = self.lineno_start_cache[ii]
+            lf_pos = self.rawdata.find('\n', last_line_start_pos)
+            if lf_pos == -1:
+                # No more newlines found. Use end of raw data as start of line beyond end.
+                lf_pos = len(self.rawdata)
+            self.lineno_start_cache.append(lf_pos+1)
+
+        return self.lineno_start_cache[self.lineno-1]
 
     def at_line_start(self) -> bool:
         """