-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathordbog.py
executable file
·149 lines (124 loc) · 4.35 KB
/
ordbog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python2.6
# coding: utf-8
import sqlite3
import re
import sys, os
from optparse import OptionParser
from groparser import parse_entry
# Maps the two-letter code accepted by the --lang option to the Danish
# language name; main() uses that name as the prefix of the data files
# (<name>Ordbog.gdd / <name>Ordbog.dat) and lookup() prints it in headings.
DICTIONARIES = dict(
    en='Engelsk',
    de='Tysk',
    fr='Fransk',
    es='Spansk',
    se='Svensk',
)
# Number of spaces added per nesting level of <ol>/<ul> lists.
_INDENT_WIDTH = 4


def _format_entry(raw_entry):
    """Turn one raw article record into indented plain text.

    The record's first 8 characters are skipped (presumably a fixed-size
    header -- confirm against groparser) and the second-to-last
    NUL-separated field is taken as the article markup.  Raises IndexError
    if the record contains no NUL separator.
    """
    text = raw_entry[8:].split('\0')[-2]

    # Headings are dropped entirely (replaced by a newline).
    text = re.sub(r'<h3>.*?</h3>', '\n', text)
    # Every tag except the list tags becomes a single space.
    text = re.sub(r' *<(?!/ol|ol|ul|/ul|li|/li).*?>', ' ', text)
    # Strip attributes from the remaining tags: <ol class="x"> -> <ol>.
    text = re.sub(r'<(/?\w+).*?>', r'<\1>', text)
    # Collapse runs of spaces, drop leading whitespace, and remove spaces
    # that touch a tag so list items start flush at their indent.
    text = re.sub(r' +', ' ', text)
    text = re.sub(r'^\s*', '', text)
    text = re.sub(r'(?<=>) *| *(?=<)', '', text)

    # Walk the text once, split into list tags and the text between them.
    # (The capturing group makes re.split keep the tags in the result.)
    pieces = []
    depth = -1  # -1 = outside any list; the first <ol>/<ul> is level 0
    for token in re.split(r'(</?(?:ol|ul|li)>)', text):
        if token in ('<ol>', '<ul>'):
            depth += 1
        elif token in ('</ol>', '</ul>'):
            depth -= 1
        elif token == '<li>':
            # A negative depth yields an empty indent string.
            pieces.append('\n' + ' ' * (_INDENT_WIDTH * depth))
        elif token != '</li>':
            pieces.append(token)
    output = ''.join(pieces)

    # Decode escaped angle brackets only now, so they are never mistaken
    # for markup by the tag handling above.
    return output.replace('&lt;', '<').replace('&gt;', '>')


def print_entries(raw_entries):
    """Print each raw article as roughly formatted plain text.

    A nasty, only semi-successful conversion of the raw article markup:
    headings are dropped, inline tags are replaced by spaces, and
    <ol>/<ul>/<li> lists are rendered as indented lines.

    raw_entries = iterable of raw article records (strings)
    """
    for raw_entry in raw_entries:
        print(_format_entry(raw_entry))
def _as_text(term):
    """Return *term* as a text string suitable for sqlite parameter binding.

    On Python 2 command-line arguments are byte strings, and sqlite3 refuses
    to bind byte strings containing non-ASCII bytes; decode them as UTF-8.
    Text strings are returned unchanged.
    """
    if isinstance(term, bytes):
        return term.decode('utf-8', 'replace')
    return term


def search(db, dictionary_path, search_terms, table, fromDanish):
    """Look up the search terms and fetch the matching articles.

    An article matches only if every search term matches it (the id sets
    of the individual terms are intersected).  Returns a list with the raw
    article data of each match, without duplicates, in the order the ids
    were returned for the first term.  An empty list of search terms
    yields an empty result.

    db = open sqlite database connection
    dictionary_path = path to the dictionary files without suffix; the
        article data is read from dictionary_path + '.dat'
    search_terms = list of search terms, used as SQL LIKE patterns
    table = name prefix of the database table holding the search words
    fromDanish = lookup direction: true selects the tables suffixed '1',
        false the tables suffixed '2'
    """
    direction = 1 if fromDanish else 2

    # Find the ids of the matching articles.
    # NOTE: the table name cannot be bound as a parameter; it must come
    # from trusted code, never from user input.  The terms are bound with
    # '?' so quotes in them cannot alter the statement.
    entry_ids = None
    for term in search_terms:
        cursor = db.execute(
            'select * from %s%i where word_ like ?' % (table, direction),
            (_as_text(term),))
        term_ids = [row[0] for row in cursor]
        if entry_ids is None:
            # First term: keep ids in database order, dropping repeats so
            # an article matched by several words is fetched only once.
            seen = set()
            entry_ids = []
            for entry_id in term_ids:
                if entry_id not in seen:
                    seen.add(entry_id)
                    entry_ids.append(entry_id)
        else:
            # Later terms narrow the result down, preserving the order.
            matching = set(term_ids)
            entry_ids = [entry_id for entry_id in entry_ids
                         if entry_id in matching]

    # Fetch the article data.
    raw_entries = []
    for entry_id in entry_ids or []:
        cursor = db.execute(
            'select * from entries%i where id_ = ?' % direction,
            (entry_id,))
        # Each row is expected to have exactly five columns; only the
        # offset and byte count are needed to read the article.
        for _, _entry_type, _link_id, offset, nbyte in cursor:
            raw_entries.append(
                parse_entry(dictionary_path + '.dat', entry_id, offset, nbyte))
    return raw_entries
def lookup(dictionary_path, search_terms, foreign_language):
    """Search the dictionary in both directions and print the results.

    The terms are looked up first Danish -> foreign language, then foreign
    language -> Danish; each section is printed to the terminal under its
    own heading, or '(intet fundet)' when nothing matched.

    dictionary_path = path to the dictionary files (without the .dat and
        .gdd suffixes)
    search_terms = list of search terms; '*' and '.' act as wildcards
    foreign_language = string with the name of the foreign language
    """
    # Translate the user's wildcards to SQL LIKE syntax ('*' and '.' both
    # become '%') and neutralise single quotes.
    like_terms = [term.replace('*', '%').replace('\'', ' ').replace('.', '%')
                  for term in search_terms]

    # (heading template, look up from Danish?)
    directions = (
        ('=== Opslagsord fra Dansk-%s:', True),
        ('\n\n=== Opslagsord fra %s-Dansk:', False),
    )

    db = sqlite3.connect(dictionary_path + '.gdd')
    try:
        for heading, from_danish in directions:
            print(heading % foreign_language)
            raw_entries = search(db, dictionary_path, like_terms,
                                 'lookup', from_danish)
            if raw_entries:
                print_entries(raw_entries)
            else:
                print('(intet fundet)')
    finally:
        # Always release the database handle, even if a query fails.
        db.close()

# NOTE: collocations (multi-word expressions) are not looked up.  That can
# be done by using the 'collocation_lookup' tables.  The dictionary can also
# be searched in reverse by using the 'reverse' tables.  See the database
# file for more!
def main():
    """Parse the command line and run the dictionary lookup.

    Prints the usage text and exits with status 1 when no search term is
    given, and prints an error and exits with status 1 when the data files
    for the selected language are missing.
    """
    # Parse the program arguments.
    usage = u'Usage: %prog [options] SEARCH_STRING'
    description = u'Blå og brugervenlig ordbog'
    parser = OptionParser(description=description, usage=usage)
    # optparse requires `choices` to be a real list or tuple; sorting also
    # gives a stable order in its "invalid choice" message.
    parser.add_option('-l', '--lang', nargs=1,
                      default='en', dest='lang',
                      choices=sorted(DICTIONARIES),
                      help=u'language (en, de, fr, es, se)')
    parser.add_option('-p', '--path', nargs=1, default='./',
                      dest='path', help='path to data files (.dat and .gdd)')
    options, search_terms = parser.parse_args()
    if not search_terms:
        parser.print_help()
        sys.exit(1)

    language = DICTIONARIES[options.lang]
    dictionary_path = os.path.join(options.path, language + 'Ordbog')
    # Both files are needed: the .gdd index database and the .dat article
    # data, so verify both up front rather than failing mid-search.
    missing = [suffix for suffix in ('.gdd', '.dat')
               if not os.path.isfile(dictionary_path + suffix)]
    if missing:
        print("Fejl, ordbogsdata ikke fundet: " + dictionary_path + '[.gdd|.dat]')
        sys.exit(1)

    # Search!
    lookup(dictionary_path, search_terms, language)


if __name__ == '__main__':
    main()