# convert_notes.py
import argparse
import logging
import os
import re
import shutil
import typing
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
parser = argparse.ArgumentParser()
parser.add_argument("--logseq", help="base directory of logseq graph", required=True)
parser.add_argument(
"--output", help="base directory where output should go", required=True
)
parser.add_argument(
"--overwrite_output",
dest="overwrite_output",
default=False,
action="store_true",
help="overwrites output directory if included",
)
parser.add_argument(
"--unindent_once",
default=False,
action="store_true",
help="unindents all lines once - lines at the highest level will have their bullet point removed",
)
parser.add_argument(
"--journal_dashes",
default=False,
action="store_true",
help="use dashes in daily journal - e.g. 2023-12-03.md",
)
parser.add_argument(
"--tag_prop_to_taglist",
default=False,
action="store_true",
help="convert tags in tags:: property to a list of tags in front matter",
)
parser.add_argument(
"--ignore_dot_for_namespaces",
default=False,
action="store_true",
help="ignore the use of '.' as a namespace character",
)
parser.add_argument(
"--convert_tags_to_links",
default=False,
action="store_true",
help="Convert #[[long tags]] to [[long tags]]",
)
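# Example invocation (hypothetical paths, shown for illustration only - adjust to your own graph):
#   python convert_notes.py --logseq ~/logseq-graph --output ~/obsidian-vault \
#       --overwrite_output --journal_dashes --convert_tags_to_links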
# Global state isn't always bad mmkay
ORIGINAL_LINE = ""
INSIDE_CODE_BLOCK = False
def is_markdown_file(fpath: str) -> bool:
return os.path.splitext(fpath)[-1].lower() == ".md"
def is_empty_markdown_file(fpath: str) -> bool:
"""Given a path to a markdown file, checks if it's empty
A file is empty if it only contains whitespace
A file containing only front matter / page properties is not empty
"""
if not is_markdown_file(fpath):
return False
with open(fpath, "r", encoding="utf-8", errors="replace") as f:
lines = f.readlines()
for line in lines:
if not line.isspace():
return False
return True
def get_markdown_file_properties(fpath: str) -> tuple[dict, int]:
"""Given a path to a markdown file, returns a dictionary of its properties and the index of the first line after the properties
Properties can either be in page property format: "title:: test"
Or in front matter format:
---
title: test
---
"""
raise NotImplementedError()
def get_namespace_hierarchy(fname: str) -> list[str]:
"""Given a markdown filename (not full path) representing a logseq page, returns a list representing the namespace
hierarchy for that file
Eg a file in the namespace "A/B/C" would return ['A', 'B', 'C.md']
    Namespaces are detected in the following ways:
Splitting by "%2F" in the file name
Splitting by "___" in the file name if the above is not present
Splitting by "." in the file name if the above is not present and the --ignore_dot_for_namespaces flag is not present
"""
split_by_pct = fname.split("%2F")
if len(split_by_pct) > 1:
return split_by_pct
split_by_underscores = fname.split("___")
if len(split_by_underscores) > 1:
return split_by_underscores
if not args.ignore_dot_for_namespaces:
split_by_dot = fname.split(".")
split_by_dot[-2] += "." + split_by_dot[-1]
split_by_dot.pop()
if len(split_by_dot) > 1:
return split_by_dot
return [fname]
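# A few illustrative calls (hypothetical filenames), following the splitting rules above:
#   get_namespace_hierarchy("A%2FB%2FC.md")  -> ["A", "B", "C.md"]
#   get_namespace_hierarchy("A___B___C.md")  -> ["A", "B", "C.md"]
#   get_namespace_hierarchy("notes.2023.md") -> ["notes", "2023.md"]  (unless --ignore_dot_for_namespaces is set)
#   get_namespace_hierarchy("plain.md")      -> ["plain.md"]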
def update_links_and_tags(line: str, name_to_path: dict, curr_path: str) -> str:
"""Given a line of a logseq page, updates any links and tags in it
:arg curr_path Absolute path of the current file, needed so that links can be replaced with relative paths
"""
    # First replace [[Aug 24th, 2022]] with [[2022-08-24]]
# This will stop the comma breaking tags
month_map = {
"Jan": "01",
"Feb": "02",
"Mar": "03",
"Apr": "04",
"May": "05",
"Jun": "06",
"Jul": "07",
"Aug": "08",
"Sep": "09",
"Oct": "10",
"Nov": "11",
"Dec": "12",
}
def reformat_dates_in_links(match: re.Match):
month = match[1]
date = match[2]
year = match[4]
if len(date) == 1:
date = "0" + date
return "[[" + year + "-" + month_map[month] + "-" + date + "]]"
line = re.sub(
r"\[\[(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\b (\d{1,2})(st|nd|rd|th), (\d{4})]]",
reformat_dates_in_links,
line,
)
# Replace #[[this type of tag]] with #this_type_of_tag or [[this type of tag]] depending on args.convert_tags_to_links
def fix_long_tag(match: re.Match):
s = match[0]
if args.convert_tags_to_links:
s = s.replace("#","")
else:
s = s.replace(" ", "_")
s = s.replace("[", "")
s = s.replace("]", "")
return s
line = re.sub(r"#\[\[.*?]]", fix_long_tag, line)
# Convert a 'short' #tag to a [[tag]] link, if args.convert_tags_to_links is true
def convert_tag_to_link(match: re.Match):
s = match[0]
if args.convert_tags_to_links:
s = s.replace("#","")
s = "[[{}]]".format(s)
return s
line = re.sub(r"#\w+", convert_tag_to_link, line)
# Replace [[This/Type/OfLink]] with [OfLink](../Type/OfLink) - for example
def fix_link(match: re.Match):
s = match[0]
s = s.replace("[", "")
s = s.replace("]", "")
# Or make it a tag if the page doesn't exist
if s not in name_to_path:
if args.convert_tags_to_links:
s = s.replace(":", ".")
return "[[" + s + "]]"
else:
s = "#" + s
s = s.replace(" ", "_")
s = s.replace(",", "_")
return s
else:
new_fpath = name_to_path[s]
relpath = os.path.relpath(new_fpath, os.path.dirname(curr_path))
            relpath = relpath.replace(" ", "%20")  # Obsidian does this
relpath = fix_escapes(relpath)
name = s.split("/")[-1]
s = "[" + name + "](" + relpath + ")" # TOFIX We return the []() format of link here rather than [[]] format which we do elsewhere
return s
line = re.sub(r"\[\[.*?]]", fix_link, line)
return line
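# Rough sketch of the rewrites above (hypothetical page names; exact output depends on
# name_to_path and the --convert_tags_to_links flag):
#   "[[Aug 24th, 2022]]"  -> "[[2022-08-24]]"
#   "#[[long tag name]]"  -> "#long_tag_name"              (or a [[...]] link with the flag)
#   "[[Some/Known]]"      -> "[Known](Some/Known.md)"      when the page exists in name_to_path
#   "[[Missing Page]]"    -> "#Missing_Page"               when the page does not exist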
def update_assets(line: str, old_path: str, new_path: str):
"""Updates embedded asset links and copies the asset
    Assets are copied to an 'attachments' subfolder in the same directory as new_path
    Images (.png, .jpg, .jpeg, .gif) are embedded; everything else is linked to
"""
def fix_asset_embed(match: re.Match) -> str:
out = []
name = match[1]
old_relpath = match[2]
if old_relpath[:8] == "file:///":
old_relpath = old_relpath[7:]
old_relpath = old_relpath.replace("%20", " ")
old_asset_path = os.path.normpath(
os.path.join(os.path.dirname(old_path), old_relpath)
)
new_asset_path = os.path.join(
os.path.dirname(new_path), "attachments", os.path.basename(old_asset_path)
)
new_asset_dir = os.path.dirname(new_asset_path)
os.makedirs(new_asset_dir, exist_ok=True)
print("Old note path: " + old_path)
print("Old asset path: " + old_asset_path)
print("New asset path: " + new_asset_path)
try:
shutil.copyfile(old_asset_path, new_asset_path)
new_relpath = os.path.relpath(new_asset_path, os.path.dirname(new_path))
except FileNotFoundError:
print(
"Warning: copying the asset from "
+ old_asset_path
+ " to "
+ new_asset_path
+ " failed, skipping it"
)
new_relpath = old_relpath
# import ipdb; ipdb.set_trace()
if os.path.splitext(old_asset_path)[1].lower() in [".png", ".jpg", ".jpeg", ".gif"]:
out.append("!")
out.append("[" + name + "]")
out.append("(" + new_relpath + ")")
return "".join(out)
line = re.sub(r"!\[(.*?)]\((.*?)\)", fix_asset_embed, line)
return line
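# Illustrative effect (hypothetical paths): an embed such as
#   ![diagram](../assets/diagram_1234.png)
# is rewritten to point at a copy under the new note's "attachments" folder, e.g.
#   ![diagram](attachments/diagram_1234.png)
# Non-image assets are rewritten the same way but as plain links (the leading "!" is dropped).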
def update_image_dimensions(line: str) -> str:
"""Updates the dimensions of embedded images with custom height/width specified
    Eg from ![name](path){:height 319, :width 568}
    to ![name|568](path)
"""
def fix_image_dim(match):
return "![" + match[1] + "|" + match[3] + "](" + match[2] + ")"
line = re.sub(r"!\[(.*?)]\((.*?)\){:height \d*, :width (\d*)}", fix_image_dim, line)
return line
def is_collapsed_line(line: str) -> bool:
"""Checks if the line is a logseq artefact representing a collapsed block"""
match = re.match(r"\s*collapsed:: true\s*", line)
return match is not None
def remove_block_links_embeds(line: str) -> str:
"""Returns the line stripped of any block links or embeddings"""
line = re.sub(r"{{embed .*?}}", "", line)
line = re.sub(r"\(\(.*?\)\)", "", line)
return line
def convert_spaces_to_tabs(line: str) -> str:
"""Converts 2-4 spaces to a tab"""
line = re.sub(r" {2,4}", "\t", line)
return line
def convert_empty_line(line: str) -> str:
"""An empty line in logseq still starts with a hyphen"""
line = re.sub(r"^- *$", "", line)
return line
def add_space_after_hyphen_that_ends_line(line: str) -> str:
"""Add a space after a hyphen that ends a line"""
line = re.sub(r"-$", "- ", line)
return line
def prepend_code_block(line: str) -> list[str]:
"""Replaces a line starting a code block after a bullet point with two lines,
so that the code block is displayed correctly in Obsidian
If this line does not start a code block after a bullet point, then returns an empty list
"""
    global INSIDE_CODE_BLOCK  # needed so the assignment below updates the module-level flag
    out = []
match = re.match(r"(\t*)-[ *]```(\w+)", line)
if match is not None:
tabs = match[1]
language_name = match[2]
out.append(tabs + "- " + language_name + " code block below:\n")
out.append(tabs + "```" + language_name + "\n")
INSIDE_CODE_BLOCK = True
# import ipdb; ipdb.set_trace()
return out
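# For example (hypothetical input), a single Logseq line of the form
#   "\t- ```python"
# is expanded into two Obsidian-friendly lines:
#   "\t- python code block below:"
#   "\t```python"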
def escape_lt_gt(line: str) -> str:
"""Escapes < and > characters"""
# Not if we're inside a code block
if INSIDE_CODE_BLOCK:
return line
# Replace < and > with \< and \> respectively, but only if they're not at the start of the line
line = re.sub(r"(?<!^)<", r"\<", line)
line = re.sub(r"(?<!^)>", r"\>", line)
return line
def convert_todos(line: str) -> str:
# Not if we're inside a code block
if INSIDE_CODE_BLOCK:
return line
line = re.sub(r"^- DONE", "- [X]", line)
line = re.sub(r"^- TODO", "- [ ]", line)
return line
def add_bullet_before_indented_image(line: str) -> str:
"""If an image has been embedded on a new line created after shift+enter, it won't be indented in Obsidian"""
def add_bullet(match):
return match[1] + "- " + match[2]
line = re.sub(r"^(\t+)(!\[.*$)", add_bullet, line)
return line
def unindent_once(line: str) -> str:
"""Returns the line after removing one level of indentation"""
# If it starts with a tab, we can just remove it
if line.startswith("\t"):
return line[1:]
# If it starts with a "- ", we can remove that
if line.startswith("- "):
return line[2:]
return line
def fix_escapes(old_str: str) -> str:
"""Given a filename, replace url escaped characters with an acceptable character for Obsidian filenames
:arg old_str old string
"""
if old_str.find("%") < 0:
return old_str
replace_map = {
"%3A":".",
}
new_str = old_str
for escape_str in replace_map:
if new_str.find(escape_str) >= 0:
new_str = new_str.replace(escape_str,replace_map[escape_str])
return new_str
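# e.g. (hypothetical filename) "foo%3Abar.md" becomes "foo.bar.md" on disk, while
# unencode_filenames_for_links() below maps the same name back to "foo:bar" so links still resolve.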
def unencode_filenames_for_links(old_str: str) -> str:
"""Given a filename, replace url escaped characters with the normal character as it would appear in a link
:arg old_str old value
"""
if old_str.find("%") < 0:
return old_str
replace_map = {
"%3A":":",
}
new_str = old_str
for escape_str in replace_map:
if new_str.find(escape_str) >= 0:
new_str = new_str.replace(escape_str,replace_map[escape_str])
return new_str
args = parser.parse_args()
old_base = args.logseq
new_base = args.output
old_to_new_paths = {}
new_to_old_paths = {}
new_paths = set()
pages_that_were_empty = set()
old_pagenames_to_new_paths = {}
# First loop: copy files to their new location, populate the maps and list of paths
if not os.path.exists(old_base) or not os.path.isdir(old_base):
raise ValueError(f"The directory '{old_base}' does not exist or is not a valid directory.")
if args.overwrite_output and os.path.exists(new_base):
shutil.rmtree(new_base)
os.makedirs(new_base, exist_ok=False)
# Copy journals pages to their own subfolder
old_journals = os.path.join(old_base, "journals")
assert os.path.isdir(old_journals)
new_journals = os.path.join(new_base, "journals")
os.mkdir(new_journals)
logging.info("Now beginning to copy the journal pages")
for fname in os.listdir(old_journals):
fpath = os.path.join(old_journals, fname)
logging.info("Now copying the journal page: " + fpath)
if os.path.isfile(fpath):
if not is_empty_markdown_file(fpath):
new_fpath = os.path.join(new_journals, fname)
if args.journal_dashes:
new_fpath = new_fpath.replace("_","-")
shutil.copyfile(fpath, new_fpath)
old_to_new_paths[fpath] = new_fpath
new_to_old_paths[new_fpath] = fpath
new_paths.add(new_fpath)
newfile = os.path.splitext(fname)[0]
old_pagenames_to_new_paths[newfile] = new_fpath
if args.journal_dashes:
old_pagenames_to_new_paths[newfile.replace("_","-")] = new_fpath
else:
pages_that_were_empty.add(fname)
# Copy other markdown files to the new base folder, creating subfolders for namespaces
old_pages = os.path.join(old_base, "pages")
assert os.path.isdir(old_pages)
logging.info("Now beginning to copy the non-journal pages")
for fname in os.listdir(old_pages):
fpath = os.path.join(old_pages, fname)
logging.info("Now copying the non-journal page: " + fpath)
if os.path.isfile(fpath) and is_markdown_file(fpath):
hierarchy = get_namespace_hierarchy(fname)
hierarchical_pagename = "/".join(hierarchy)
if is_empty_markdown_file(fpath):
pages_that_were_empty.add(fname)
else:
new_fpath = os.path.join(new_base, *hierarchy)
new_fpath = fix_escapes(new_fpath)
logging.info("Destination path: " + new_fpath)
new_dirname = os.path.split(new_fpath)[0]
os.makedirs(new_dirname, exist_ok=True)
shutil.copyfile(fpath, new_fpath)
old_to_new_paths[fpath] = new_fpath
new_to_old_paths[new_fpath] = fpath
new_paths.add(new_fpath)
old_pagename = os.path.splitext(hierarchical_pagename)[0]
old_pagenames_to_new_paths[
old_pagename
] = new_fpath
# Add mapping of unencoded filename for links
old_pagenames_to_new_paths[
unencode_filenames_for_links(old_pagename)
] = new_fpath
# Second loop: for each new file, reformat its content appropriately
for fpath in new_paths:
newlines = []
with open(fpath, "r", encoding="utf-8", errors="replace") as f:
lines = f.readlines()
# First replace the 'title:: my note' style of front matter with the Obsidian style (triple dashed)
front_matter = {}
in_front_matter = False
first_line_after_front_matter = 0
for idx, line in enumerate(lines):
match = re.match(r"(.*?)::[\s]*(.*)", line)
if match is not None:
front_matter[match[1]] = match[2]
first_line_after_front_matter = idx + 1
else:
break
if bool(front_matter):
# import ipdb; ipdb.set_trace()
newlines.append("---\n")
for key in front_matter:
if (key.find("tags") >= 0 or key.find("Tags") >= 0) and args.tag_prop_to_taglist:
# convert tags:: value1, #[[value 2]]
# to
                # Taglinks:
# - "[[value1]]"
# - "[[value 2]]"
tags = front_matter[key].split(",")
newlines.append("Taglinks:\n")
for tag in tags:
tag = tag.strip()
clean_tag = tag.replace("#","")
clean_tag = clean_tag.replace("[[","")
clean_tag = clean_tag.replace("]]","")
newlines.append(' - "[[' + clean_tag + ']]"' + "\n")
else:
newlines.append(key + ": " + front_matter[key] + "\n")
newlines.append("---\n")
for line in lines[first_line_after_front_matter:]:
ORIGINAL_LINE = line
# Update global state if this is the end of a code block
if INSIDE_CODE_BLOCK and line == "```\n":
INSIDE_CODE_BLOCK = False
        # Ignore the line if it's a collapsed:: true line
if is_collapsed_line(line):
continue
# Convert empty lines in logseq to empty lines in Obsidian
line = convert_empty_line(line)
# Convert 2-4 spaces to a tab
line = convert_spaces_to_tabs(line)
# Unindent once if the user requested it
if args.unindent_once:
line = unindent_once(line)
# Add a line above the start of a code block in a list
        code_block_lines = prepend_code_block(line)
        if len(code_block_lines) > 0:
            newlines.append(code_block_lines[0])
            line = code_block_lines[1]
# Update links and tags
line = update_links_and_tags(line, old_pagenames_to_new_paths, fpath)
# Update assets
line = update_assets(line, new_to_old_paths[fpath], fpath)
# Update image dimensions
line = update_image_dimensions(line)
# Remove block links and embeds
line = remove_block_links_embeds(line)
# Self-explanatory
line = add_space_after_hyphen_that_ends_line(line)
# Self-explanatory
line = convert_todos(line)
# < and > need to be escaped to show up as normal characters in Obsidian
line = escape_lt_gt(line)
# Make sure images are indented correctly
line = add_bullet_before_indented_image(line)
newlines.append(line)
with open(fpath, "w", encoding="utf-8") as f:
f.writelines(newlines)