diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..0e259d4 --- /dev/null +++ b/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. 
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
diff --git a/README.md b/README.md
index 3fc3aa8..8b6b0ab 100644
--- a/README.md
+++ b/README.md
@@ -45,5 +45,5 @@ optional arguments:
 ```
 
-Written by hoelty, license: CC0.
+Written by hoelty, revised by mueslimak3r, license: CC0.
diff --git a/litepubify.py b/litepubify.py
index b932bac..c77df31 100644
--- a/litepubify.py
+++ b/litepubify.py
@@ -25,6 +25,7 @@ import uuid
 import xml.sax.saxutils as saxutils
 import zipfile
+import bs4
 
 # python 2 / 3 compatibility code
 try:
@@ -84,7 +85,7 @@ def main():
         debug("title: '{}', author: '{}', memberpage: '{}'".format(title, author, memberpage_url))
 
     memberpage, _ = fetch_url(memberpage_url)
-    (all_oneshots, all_series) = parse_story_list(memberpage)
+    (all_oneshots, all_series) = parse_author_works_page(memberpage)
 
     if args.debug:
         debug('ALL STORIES BY AUTHOR {}:'.format(author))
@@ -100,16 +101,29 @@
     for url in args.url:
         story_html, _ = fetch_url(url)
         page_id = extract_id(url)
+        debug("target id [{}]".format(page_id))
         found_story = None
         found_series = None
         for st in all_oneshots:
             if extract_id(st.url) == page_id:
                 found_story = st
                 found_oneshots_and_series.append(found_story)
                 break
         if not found_story:
+            # The id may refer to a series page itself...
             for series in all_series:
+                if extract_id(series.url) == page_id:
+                    found_series = series
+                    found_story = series.stories[0]
+                    if args.single:
+                        # with --single, only the first chapter of the series is taken
+                        found_oneshots_and_series.append(found_story)
+                    else:
+                        found_oneshots_and_series.append(series)
+                    break
+                # ...or to an individual chapter inside a series.
                 for story in series.stories:
                     if extract_id(story.url) == page_id:
                         found_story = story
@@ -149,28 +163,26 @@ def parse_commandline_arguments():
     parser.add_argument('--disk-cache-path', metavar='PATH', help='Path for the disk cache (optional, usually not required). If this option is specified, downloaded websites are cached in a file and loaded from disk in subsequent runs (when this option is used again with the same path). This is mainly useful for testing, to avoid repeated downloads. Without this option, litepubify keeps everything in memory and only writes the final epub file to disk.')
 
     args = parser.parse_args()
 
-def parse_story_header(html):
-    """Parses the header of the story html to find title, author and the link to the author's memberpage.
-
-    Args:
-        html (text): the full html text of the story
-    Returns:
-        title, author and memberpage url as a 3-tuple
-
-    """
-    header_match = re.search(r'<div class="b-story-header">(.*?)</div>', html, flags=re.DOTALL)
-    if not header_match:
-        error("Cannot find header in html.")
-    header_match2 = re.search(r'<h1>(.*?)</h1>.*?<a href="(.*?)".*?>(.*?)</a>', header_match.group(1), flags=re.DOTALL)
-    if not header_match2:
-        error("Cannot parse header.")
-    title = header_match2.group(1)
-    memberpage_url = header_match2.group(2)
-    memberpage_url = re.sub(r'&amp;', r'&', memberpage_url)
-    memberpage_url = re.sub(r'^//', r'http://', memberpage_url)
-    author = header_match2.group(3)
-    return (title, author, memberpage_url)
+def parse_story_header(html):
+    """Parses the story page to find the title, the author and the URL of the author's works page."""
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    title_tag = soup.find('h1', class_='headline')
+    if title_tag:
+        title = title_tag.text.strip()
+    else:
+        error("Cannot find title in html.")
+    element = soup.find('div', {'title': 'Stories'})
+    if element:
+        url = str(element.find_all('a', recursive=False)[0]['href'])
+        # the author name is the second-to-last path segment of the works URL
+        author = url.split('/')[-2]
+    else:
+        error("Cannot find author's member page link in html.")
+    return title, author, url
+
 
 def make_epub_from_stories_and_series(stories_and_series, author):
     """Make epub file from story or series.
 
@@ -343,74 +355,106 @@ def out(self, data):
     def get_output(self):
         return self.accum
 
-def parse_story_list(html):
-    """Parse the list of stories from the submissions section of the author's memberpage.
-    """
-    author_match = re.search(r'<h1>(.*?)</h1>.*?<a name="submissions">', html, flags=re.DOTALL)
-    if not author_match:
-        error("Cannot determine author on member page.")
-    author = author_match.group(1)
-
-    subm_table_match = re.search(r'<a name="submissions">(.*?)</table>', html, flags=re.DOTALL)
-    if not subm_table_match:
-        error("Cannot find list of submissions on member page.")
-
-    trs = re.findall(r'(<tr.*?</tr>)', subm_table_match.group(1), re.DOTALL)
-
-    all_series = []
-    all_oneshots = []
-    series = None
-    story = None
-    for tr in trs:
-        if tr.startswith(r'<tr class="ser-ttl">'): # series title
-            series_title_match = re.search(r'<strong>(.*?)</strong>', tr)
-            if not series_title_match:
-                error("Cannot find series title: '{}'".format(tr))
-            series = Series()
-            series.title = series_title_match.group(1)
-            series.title = re.sub(r': \d+ Part Series$', '', series.title)
-            series.author = author
-            all_series.append(series)
-        elif tr.startswith(r'<tr class="sl">') or tr.startswith(r'<tr class="root-story">'):
-            tds = re.findall(r'<td.*?>(.*?)</td>', tr, re.DOTALL)
-            td0_match = re.search(r'<a href="(.*?)".*?>(.*?)</a>.*?[(](.*?)[)]', tds[0])
-            if not td0_match: error("Couldn't match 1st field: '{}'".format(tds[0]))
-            story = Story()
-            story.url = td0_match.group(1)
-            story.url = re.sub(r'^//', r'http://', story.url)
-            story.title = td0_match.group(2)
-            story.title = re.sub(r'<b>|</b>|<br>', '', story.title)
-            story.author = author
-            story.rating = td0_match.group(3)
-
-            td1_match = re.search(r'^\s*([^<]*)(<|$)', tds[1], flags=re.DOTALL | re.UNICODE)
-            if not td1_match: error("Couldn't match 2nd field: '{}'".format(tds[1]))
-            story.teaser = td1_match.group(1)
-            story.teaser = story.teaser.strip()
-            if re.search(r'ico_h.gif', tds[1]):
-                story.hot = True
-            else:
-                story.hot = False
-
-            td2_match = re.search(r'<a.*?>(.*?)</a>', tds[2])
-            story.category = td2_match.group(1)
-
-            td3_match = re.search(r'\s*(.+)\s*', tds[3])
-            story.date = td3_match.group(1)
-
-            if tr.startswith(r'<tr class="sl">'):
-                series.stories.append(story)
-                story = None
-            else:
-                series = None
-                all_oneshots.append(story)
-                story = None
-        elif tr.startswith(r'<tr class="st-top">'): # ignore
-            pass
-        else:
-            error("Unkown row type: '{}'".format(tr))
-
-    return (all_oneshots, all_series)
+def validate_classes(element, rules):
+    """Match an element's CSS classes against (required_prefixes, excluded_prefixes).
+
+    True iff every required prefix is matched, in order, by one of the
+    element's classes and no class starts with an excluded prefix.
+    """
+    classes = element.get("class")  # use .get(): not every element has a class attribute
+    required_classes = rules[0]
+    excluded_classes = rules[1]
+    if not required_classes or not classes:
+        return False
+
+    rc_idx = 0
+    found_classes = []
+    for c in classes:
+        for ec in excluded_classes:
+            if c.startswith(ec):
+                return False
+        if rc_idx < len(required_classes):
+            for rc in required_classes[rc_idx:]:
+                if c.startswith(rc):
+                    found_classes.append(c)
+                    rc_idx += 1
+    return len(list(set(found_classes))) == len(required_classes)
+
+def parse_series_page(page_url, author):
+    """Fetch a series page and return its chapters as a list of Story objects."""
+    html, _ = fetch_url(page_url)
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    chapters_container = soup.select("ul[class=series__works]")[0]
+    chapter_elements = chapters_container.find_all('li', recursive=False)
+    stories = []
+    for chapter_elem in chapter_elements:
+        story = Story()
+        title_elem = chapter_elem.find_all('a', recursive=False)[0]
+        story.url = title_elem['href']
+        story.title = title_elem.text.strip()
+        story.author = author
+
+        # The subtitle line holds the category link followed by the teaser text.
+        subtitle_elem = chapter_elem.find_all('p', recursive=False)[0]
+        story.category = subtitle_elem.find_all('a', recursive=False)[0].text.strip()
+        teaser = subtitle_elem.text.strip()
+        story.teaser = teaser.replace(story.category, '').strip()
+
+        # Rating, hotness and date are not shown on the series page.
+        story.rating = "0.0"
+        story.hot = False
+        story.date = ""
+        stories.append(story)
+    return stories
+
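+# The works page is parsed with BeautifulSoup instead of regexes. Its
+# generated CSS class names appear to keep stable prefixes (e.g.
+# "_works_item...") while the suffixes vary, so elements are matched by class
+# prefix. A row is treated as a series header, for example, when
+#   validate_classes(row, (['_works_item__series_expanded_header_card'], []))
+# returns True.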
+def parse_author_works_page(html):
+    """Parse the author's works page into standalone stories and series."""
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    author_element = soup.find('title')
+    if not author_element:
+        error("Cannot determine author on member page.")
+    if "Stories by " in author_element.text.strip():
+        author = author_element.text.strip().replace("Stories by ", "").strip()
+    else:
+        error("Cannot determine author on member page.")
+    subm_table_match = soup.select("div[class^=_works_wrapper]")
+    if not subm_table_match:
+        error("Cannot find submission table.")
+    subm_table_match = subm_table_match[0]
+    trs = subm_table_match.find_all('div', recursive=False)
+    debug("found {} rows in works list".format(len(trs)))
+
+    all_series = []
+    all_oneshots = []
+    series = None
+    story = None
+    ONESHOT_CLASS = (['_works_item'], [])
+    SERIES_CLASS = (['_works_item__series_expanded_header_card'], [])
+    for tr in trs:
+        if validate_classes(tr, SERIES_CLASS):
+            title_elem = tr.select("a[class^=_item_title]")[0]
+            debug("series: {}".format(title_elem.text.strip()))
+            series = Series()
+            series.title = title_elem.text.strip()
+            series.author = author
+            series.url = title_elem['href']
+            # the chapter list is not part of this row; scrape the series page
+            series.stories = parse_series_page(series.url, author)
+            all_series.append(series)
+        elif validate_classes(tr, ONESHOT_CLASS):
+            title_elem = tr.select("a[class^=_item_title]")[0]
+            debug("story: {}".format(title_elem.text.strip()))
+            story_stats_elem = tr.select("div[class^=_stats]")[0]
+            story = Story()
+            story.title = title_elem.text.strip()
+            story.author = author
+            story.url = title_elem['href']
+            if not story.url.startswith('https://www.literotica.com'):
+                story.url = "https://www.literotica.com" + story.url
+            rating_elem = story_stats_elem.find('span', {'title': 'Rating'})
+            if rating_elem:
+                story.rating = rating_elem.find_all('span', recursive=False)[0].text.strip()
+            else:
+                story.rating = "0.0"
+            story.hot = bool(story_stats_elem.find('span', {'title': 'Hot'}))
+            story.category = tr.select("a[class^=_item_category]")[0].text.strip()
+            story.date = tr.select("span[class^=_date_approve]")[0].text.strip()
+            desc = tr.select("p[class^=_item_description]")
+            story.teaser = desc[0].text.strip() if desc else ""
+            all_oneshots.append(story)
+    debug("total oneshots [{}], total series [{}]".format(len(all_oneshots), len(all_series)))
+    return (all_oneshots, all_series)
 
 
@@ -429,40 +473,66 @@ def extract_id(url):
     p = re.sub('/$', '', p)
     idx = p.rfind('/')
     if idx == -1: error("unexpected url: {}".format(url))
-    return p[idx+1:]
+    url_id = p[idx+1:]
+    debug("url_id [{}]".format(url_id))
+    return url_id
 
 
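+# Story text is spread over numbered pages: page 1 is the bare story URL,
+# page N is fetched as <story-url>?page=N, and the page count is read from
+# the highest numbered link in the pager.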
 def get_story_text(st):
+    debug('getting text')
     html, _ = fetch_url(st.url) # assuming url leads to first page and has no query part
-    sel_match = re.search(r'<select name="page">(.*?)</select>', html)
-    if not sel_match: error("Couldn't find page selection part.")
-    vals = re.findall(r'<option value="(\d+)">', sel_match.group(1))
-    if not vals: # just one page
-        vals = ['1']
-    complete_text = None
-    for v in vals:
-        url = st.url + '?page=' + v
-        if v == '1':
-            url = st.url
-        html, _ = fetch_url(url)
-        text_match = re.search(r'<div class="b-story-body-x x-r15">.*?<div>(.*?)</div>', html, re.DOTALL)
-        if not text_match: error("Couldn't find text body.")
-        text = text_match.group(1)
-        text = text.strip()
-        strip_outer_p_match = re.search(r'^<div>\n<p>(.*)</p>\n</div>$', text, re.DOTALL)
-        if strip_outer_p_match:
-            text = strip_outer_p_match.group(1)
-
-        if complete_text == None:
-            complete_text = text
-        else:
-            complete_text += '\n\n' + text
-
-    if not complete_text:
-        warning('Unable to extract text for {}.'.format(st.url))
-    complete_text = '<div>\n<p>{}</p>\n</div>'.format(complete_text)
-
-    return complete_text
+    soup = bs4.BeautifulSoup(html, 'html.parser')
+    # Locate the pager: the numbered page links are siblings of the
+    # 'Previous Page' marker.
+    paginator_parent_element = soup.find('span', {'title': 'Previous Page'})
+    paginator_elements = []
+    if paginator_parent_element:
+        paginator_parent_element = paginator_parent_element.parent
+        paginator_elements = paginator_parent_element.find_all('a', recursive=False)
+
+    # The highest numeric pager link is the page count.
+    end = 1
+    for pe in paginator_elements:
+        if pe.text.strip() == '' or not pe.text.strip().isnumeric():
+            continue
+        if int(pe.text.strip()) > end:
+            end = int(pe.text.strip())
+
+    complete_text = ""
+    for idx in range(1, end+1):
+        debug("page {}".format(idx))
+        url_suffix = "" if idx == 1 else "?page={current_page}".format(current_page=idx)
+        url = st.url + url_suffix
+        story_page_html, _ = fetch_url(url)
+        story_page_soup = bs4.BeautifulSoup(story_page_html, 'html.parser')
+        text_parents = story_page_soup.select('.panel.article')
+        if not text_parents:
+            error("Couldn't find text body.")
+        text_parent = text_parents[0]
+        # Collect the story paragraphs of this page, skipping empty ones.
+        text_elements = text_parent.find_all('p', recursive=True)
+        lines = ""
+        for elem in text_elements:
+            if elem.text is None or elem.text == "":
+                continue
+            lines += "\n{}".format(str(elem)) if lines != "" else str(elem)
+        if idx > 1:
+            complete_text += '\n\n' + lines
+        else:
+            complete_text = lines
+    if complete_text == "":
+        warning('Unable to extract text for {}.'.format(st.url))
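+    # Wrap the assembled pages in a single container so the epub builder
+    # receives one block of markup, as before.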

+    return "<div>\n<p>%s</p>\n</div>" % complete_text
 
 
 class FrozenClass(object):
@@ -478,7 +548,6 @@ def _freeze(self):
         self.__isfrozen = True
 
 
-
 class Story(FrozenClass):
     """A single story.
 
@@ -520,6 +589,7 @@ class Series(FrozenClass):
     def __init__(self):
         self.title = None
         self.author = None
+        self.url = None
         self.stories = []
         self._freeze()
 
@@ -557,8 +627,9 @@ def fetch_url(url, binary=False):
             mime_type = mime_type.decode('UTF-8')
             url_mem_cache[url] = (data, mime_type)
             return data, mime_type
-    info("downloading '{}'...".format(url))
+    info("fetching '{}'...".format(url))
     req = compat_urllib_request.Request(url, headers={ 'User-Agent': get_user_agent() })
+    # Variant kept for reference: additionally send the 'enable_classic' cookie:
+    # req = compat_urllib_request.Request(url, headers={ 'User-Agent': get_user_agent(), 'Cookie': 'enable_classic=1' })
     response = compat_urllib_request.urlopen(req)
     data = response.read()
     mime_type = get_content_type(response)