Skip to content

Commit

Permalink
Merge pull request #109 from RealFakeAccount/master
Browse files Browse the repository at this point in the history
add xlsx wordlist generator
  • Loading branch information
SethClydesdale authored Nov 9, 2020
2 parents cab952e + 881fd5f commit 7fd28e8
Show file tree
Hide file tree
Showing 38 changed files with 108 additions and 1 deletion.
4 changes: 4 additions & 0 deletions resources/javascript/homepage.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
content : 'Looking for more self-study resources? Visit the official <a href="http://genki.japantimes.co.jp/self_en">self-study room</a> for Genki or check out some of the resources in the <a href="https://github.com/SethClydesdale/genki-study-resources#resources-for-studying-japanese">readme</a> on GitHub. If you use Anki to study vocab, you can find decks for the vocab on Genki Study Resources <a href="' + getPaths() + 'help/anki-decks/">here</a>!'
},

{
content : 'You can now find xlsx vocabulary lists for Genki in our <a href="https://github.com/SethClydesdale/genki-study-resources/tree/master/resources/tools/wordlist_E-J">Github</a>!'
},

{
content : 'Have a question about the site? Check out the <a href="' + getPaths() + 'help/">FAQ</a>! If you can\'t find an answer to your question, feel free to contact us via <a href="https://github.com/SethClydesdale/genki-study-resources/issues">GitHub\'s issues</a> and we\'ll try to answer your question in a timely manner.'
},
Expand Down
21 changes: 20 additions & 1 deletion resources/tools/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,4 +88,23 @@ python3 anki_decks_maker.py ../../lessons

You can also generate decks by executing `anki_decks_maker-run.bat` and typing either `2nd` or `3rd` to generate a deck for that edition.

All of the decks created are currently available under the [decks](decks/) folder
All of the decks created are currently available under the [decks](decks/) folder


### wordlist_E-J.py
* Requires python 3.6+.

Creates xlsx wordlists that pair each word with its English meaning, to help you memorise vocabulary.


```shell script
python3 wordlist_E-J.py <path_to_lessons_folder>

# For example:
python3 wordlist_E-J.py ../../lessons-3rd
python3 wordlist_E-J.py ../../lessons
```

You can also generate the wordlists by executing `wordlist_E-J-run.bat` and typing either `2nd` or `3rd` to generate the lists for that edition.

All of the lists created are currently available under the [wordlist_E-J](wordlist_E-J/) folder
9 changes: 9 additions & 0 deletions resources/tools/wordlist_E-J-run.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
@echo off
rem Interactive launcher for wordlist_E-J.py: asks which edition to process
rem and runs the generator against the matching lessons folder.
title xlsx wordlist Maker for Genki Study Resources
color 1F

rem Clear any inherited value so that pressing ENTER alone counts as "no edition".
set "id="
set /p id="Type 2nd or 3rd, then press ENTER to generate xlsx wordlists for that edition. "

rem Both sides are quoted so that empty input or input containing spaces
rem falls through to the final branch instead of breaking the IF syntax.
if "%id%"=="2nd" (
    python wordlist_E-J.py ../../lessons
) else if "%id%"=="3rd" (
    python wordlist_E-J.py ../../lessons-3rd
) else (
    echo No edition selected, please press any key to terminate the program.
)

pause
75 changes: 75 additions & 0 deletions resources/tools/wordlist_E-J.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import re
import ast
import sys
from pathlib import Path
from itertools import chain

import openpyxl

# Root folder of the lessons to process, taken from the first command-line argument.
lessons_folder = Path(sys.argv[1])
# Captures the text on either side of a ':' inside <title>, up to "- Lesson".
title_regex = re.compile(r'<title>(.*):(.*)- Lesson')
# Captures what follows "quizlet : " up to and including the first '}' (re.S lets '.' span newlines).
quizlet_regex = re.compile(r'quizlet : (.*?})', flags=re.S)
# Matches markers of pages that main() skips (exercise types that are not vocab).
filter_regex = re.compile(r"format : 'kanji'|format : 'practice'|format : 'hirakata'|type : 'fill'|type : 'drawing'|type : 'stroke'|type : 'multi'|type : 'writing'|<title>Review:|Kanji Practice: Match the Readings|Kanji Practice: Match the Sentences|Kanji Practice: Match the Verbs|Katakana Practice: Countries and Capitals", flags=re.S)
# Output location: <current working directory>/wordlist_E-J/<name of the lessons folder>.
output_folder = Path('.').absolute().joinpath('wordlist_E-J').joinpath(lessons_folder.name)


def get_tags(html):
    """Build two underscore-joined tags from the page's <title> element.

    Example:
        <title>Useful Expressions: Time (Minutes 11-30) - Lesson 1 | Genki ...</title>
        gives ('Useful_Expressions', 'Time_(Minutes_11-30)')

    Both captures of ``title_regex`` are stripped of surrounding whitespace
    and have their remaining spaces replaced with underscores.

    Returns:
        A 2-tuple of strings: (first capture, second capture).

    Raises:
        AttributeError: if ``title_regex`` does not match *html*.
    """
    found = title_regex.search(html)
    category, topic = (
        found.group(index).strip().replace(' ', '_') for index in (1, 2)
    )
    return category, topic


def get_vocab(html):
    """Extract the page's ``quizlet`` literal and return it as a Python value.

    The first capture of ``quizlet_regex`` is taken from *html*, every ``//``
    in it is replaced with ``#`` (presumably so that JavaScript line comments
    read as Python comments), and the result is parsed with
    ``ast.literal_eval``.

    The ``//`` replacement is purely textual, so it also applies to any
    ``//`` that occurs inside a quoted value.

    Raises:
        AttributeError: if ``quizlet_regex`` does not match *html*.
        ValueError, SyntaxError: if the captured text is not a valid literal.
    """
    found = quizlet_regex.search(html)
    js_literal = found.group(1)
    python_literal = js_literal.replace('//', '#')
    return ast.literal_eval(python_literal)


def main():
    """Generate one xlsx wordlist per lesson folder of the selected edition.

    For every ``lesson*`` folder under ``lessons_folder`` a workbook is built
    with an "English" / "Japanese" header row, followed by one row per vocab
    entry read from the ``index.html`` of the lesson's ``vocab*`` and
    ``literacy*`` sub-folders. Pages matching ``filter_regex`` are skipped;
    pages whose vocab cannot be parsed are reported and skipped. Once every
    lesson has been processed, each workbook is saved to ``output_folder`` as
    ``<lesson folder name>.xlsx``.

    Raises:
        FileExistsError: if ``output_folder`` exists but is not a directory.
        OSError: if the output folder cannot be created or a page cannot be
            read.
    """
    try:
        print('Creating folder for xlsx...')
        output_folder.mkdir(parents=True, exist_ok=False)
    except FileExistsError:
        # Only an existing directory is harmless. Anything else (for example
        # a regular file at that path) would make every save fail later, so
        # let the error surface instead of reporting it as "already exists".
        if not output_folder.is_dir():
            raise
        print('Folder already exists, skipping this step.')

    # Compiled once: strips <...> markup from the English meanings.
    markup_regex = re.compile(r"\<(.*?)\>")
    workbooks = []

    # glob() yields entries in filesystem order; sorting keeps the processing
    # order (and therefore the row order inside each sheet) reproducible.
    for lesson_folder in sorted(lessons_folder.glob('lesson*')):
        lesson_number = lesson_folder.name.split('-')[-1]

        print(f'Getting vocab for Lesson {lesson_number}...')

        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet.cell(1, 1).value = "English"
        sheet.cell(1, 2).value = "Japanese"
        row_num = 2  # row 1 holds the header

        # vocab* folders are processed before literacy* folders.
        vocab_folders = chain(
            sorted(lesson_folder.glob('vocab*')),
            sorted(lesson_folder.glob('literacy*')),
        )
        for vocab_folder in vocab_folders:
            with open(vocab_folder.joinpath('index.html'), 'r', encoding='UTF8') as f:
                html = f.read()

            # Filter out exercise types that are NOT vocab.
            if filter_regex.search(html) is not None:
                continue

            try:
                vocab = get_vocab(html)
            except Exception:
                # Best effort: one unparsable page must not abort the run.
                print(f'Failed parsing of lesson-{lesson_number}, vocab file {vocab_folder}')
                continue

            for jp, eng in vocab.items():
                sheet.cell(row_num, 1).value = markup_regex.sub('', eng)
                sheet.cell(row_num, 2).value = jp
                row_num += 1

        workbooks.append((wb, lesson_folder.name))

    for wb, name in workbooks:
        print(f'Creating deck for {name}...')
        wb.save(output_folder.joinpath(f'{name}.xlsx'))

    print('All xlsx list for the selected edition have been generated!')


# Run the generator only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 7fd28e8

Please sign in to comment.