% TF-IDF
@online{wiki_tf_idf,
author = {{Wikipedia contributors}},
title = {tf--idf},
url = {https://en.wikipedia.org/wiki/Tf%E2%80%93idf},
urldate = {2024-05-24}
}
% BoW
@online{wiki_BoW,
author = {{Wikipedia contributors}},
title = {Bag-of-words model},
url = {https://en.wikipedia.org/wiki/Bag-of-words_model},
urldate = {2024-05-24}
}
% Word2Vec
@misc{mikolov2013efficient,
title = {Efficient Estimation of Word Representations in Vector Space},
author = {Tomas Mikolov and Kai Chen and Greg Corrado and Jeffrey Dean},
year = {2013},
eprint = {1301.3781},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% GloVe
@inproceedings{pennington2014glove,
author = {Jeffrey Pennington and Richard Socher and Christopher D. Manning},
booktitle = {Empirical Methods in Natural Language Processing (EMNLP)},
title = {GloVe: Global Vectors for Word Representation},
year = {2014},
pages = {1532--1543},
url = {http://www.aclweb.org/anthology/D14-1162}
}
% FastText
@misc{bojanowski2017enriching,
title = {Enriching Word Vectors with Subword Information},
author = {Piotr Bojanowski and Edouard Grave and Armand Joulin and Tomas Mikolov},
year = {2016},
eprint = {1607.04606},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% Transformers
@misc{vaswani2023attention,
title = {Attention Is All You Need},
author = {Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
year = {2017},
eprint = {1706.03762},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% MTEB
@misc{muennighoff2023mteb,
title = {MTEB: Massive Text Embedding Benchmark},
author = {Niklas Muennighoff and Nouamane Tazi and Loïc Magne and Nils Reimers},
year = {2023},
eprint = {2210.07316},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% BERT / mBERT
@misc{devlin2019bert,
title = {BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding},
author = {Jacob Devlin and Ming-Wei Chang and Kenton Lee and Kristina Toutanova},
year = {2019},
eprint = {1810.04805},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% CZERT
@misc{sido2021czert,
title = {Czert -- Czech BERT-like Model for Language Representation},
author = {Jakub Sido and Ondřej Pražák and Pavel Přibáň and Jan Pašek and Michal Seják and Miloslav Konopík},
year = {2021},
eprint = {2103.13031},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% Seznam models
@misc{bednář2023like,
title = {Some Like It Small: Czech Semantic Embedding Models for Industry Applications},
author = {Jiří Bednář and Jakub Náplava and Petra Barančíková and Ondřej Lisický},
year = {2023},
eprint = {2311.13921},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% Distillation / DistilBERT
@misc{sanh2020distilbert,
title = {DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter},
author = {Victor Sanh and Lysandre Debut and Julien Chaumond and Thomas Wolf},
year = {2020},
eprint = {1910.01108},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% mE5
@misc{wang2024multilingual,
title = {Multilingual E5 Text Embeddings: A Technical Report},
author = {Liang Wang and Nan Yang and Xiaolong Huang and Linjun Yang and Rangan Majumder and Furu Wei},
year = {2024},
eprint = {2402.05672},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% E5
@misc{wang2024text,
title = {Text Embeddings by Weakly-Supervised Contrastive Pre-training},
author = {Liang Wang and Nan Yang and Xiaolong Huang and Binxing Jiao and Linjun Yang and Daxin Jiang and Rangan Majumder and Furu Wei},
year = {2022},
eprint = {2212.03533},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% mC4 dataset
@misc{xue2021mt5,
title = {mT5: A massively multilingual pre-trained text-to-text transformer},
author = {Linting Xue and Noah Constant and Adam Roberts and Mihir Kale and Rami Al-Rfou and Aditya Siddhant and Aditya Barua and Colin Raffel},
year = {2021},
eprint = {2010.11934},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% LaBSE
@misc{feng2022languageagnostic,
title = {Language-agnostic BERT Sentence Embedding},
author = {Fangxiaoyu Feng and Yinfei Yang and Daniel Cer and Naveen Arivazhagan and Wei Wang},
year = {2022},
eprint = {2007.01852},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% AnglE (UAE-Large-V1)
@misc{li2024angleoptimized,
title = {AnglE-optimized Text Embeddings},
author = {Xianming Li and Jing Li},
year = {2024},
eprint = {2309.12871},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% XLM-Roberta
@misc{conneau2020unsupervised,
title = {Unsupervised Cross-lingual Representation Learning at Scale},
author = {Alexis Conneau and Kartikay Khandelwal and Naman Goyal and Vishrav Chaudhary and Guillaume Wenzek and Francisco Guzmán and Edouard Grave and Myle Ott and Luke Zettlemoyer and Veselin Stoyanov},
year = {2020},
eprint = {1911.02116},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% ALBERT
@misc{lan2020albert,
title = {ALBERT: A Lite BERT for Self-supervised Learning of Language Representations},
author = {Zhenzhong Lan and Mingda Chen and Sebastian Goodman and Kevin Gimpel and Piyush Sharma and Radu Soricut},
year = {2020},
eprint = {1909.11942},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% RetroMAE
@misc{xiao2022retromae,
title = {RetroMAE: Pre-Training Retrieval-oriented Language Models Via Masked Auto-Encoder},
author = {Shitao Xiao and Zheng Liu and Yingxia Shao and Zhao Cao},
year = {2022},
eprint = {2205.12035},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% ParaCrawl
@inproceedings{espla2019paracrawl,
title = {ParaCrawl: Web-scale parallel corpora for the languages of the EU},
author = {Esplà, Miquel and Forcada, Mikel and Ramírez-Sánchez, Gema and Hoang, Hieu},
booktitle = {Proceedings of Machine Translation Summit XVII: Translator, Project and User Tracks},
pages = {118--119},
year = {2019},
organization = {European Association for Machine Translation}
}
% CzEng dataset
@misc{kocmi2020announcing,
title = {Announcing CzEng 2.0 Parallel Corpus with over 2 Gigawords},
author = {Tom Kocmi and Martin Popel and Ondrej Bojar},
year = {2020},
eprint = {2007.03006},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% SimCSE
@misc{gao2022simcse,
title = {SimCSE: Simple Contrastive Learning of Sentence Embeddings},
author = {Tianyu Gao and Xingcheng Yao and Danqi Chen},
year = {2022},
eprint = {2104.08821},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% small-e-czech
@misc{kocián2021siamese,
title = {Siamese BERT-based Model for Web Search Relevance Ranking Evaluated on a New Czech Dataset},
author = {Matěj Kocián and Jakub Náplava and Daniel Štancl and Vladimír Kadlec},
year = {2021},
eprint = {2112.01810},
archiveprefix = {arXiv},
primaryclass = {cs.IR}
}
% MS MARCO dataset
@misc{bajaj2018ms,
title = {MS MARCO: A Human Generated MAchine Reading COmprehension Dataset},
author = {Payal Bajaj and Daniel Campos and Nick Craswell and Li Deng and Jianfeng Gao and Xiaodong Liu and Rangan Majumder and Andrew McNamara and Bhaskar Mitra and Tri Nguyen and Mir Rosenberg and Xia Song and Alina Stoica and Saurabh Tiwary and Tong Wang},
year = {2018},
eprint = {1611.09268},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% NQ dataset
@article{kwiatkowski2019natural,
title = {Natural Questions: A Benchmark for Question Answering Research},
author = {Kwiatkowski, Tom and Palomaki, Jennimaria and Redfield, Olivia and Collins, Michael and Parikh, Ankur and Alberti, Chris and Epstein, Danielle and Polosukhin, Illia and Devlin, Jacob and Lee, Kenton and Toutanova, Kristina and Jones, Llion and others},
journal = {Transactions of the Association for Computational Linguistics},
volume = {7},
year = {2019}
}
% TriviaQA
@misc{joshi2017triviaqa,
title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},
author = {Mandar Joshi and Eunsol Choi and Daniel S. Weld and Luke Zettlemoyer},
year = {2017},
eprint = {1705.03551},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% SQuAD
@misc{rajpurkar2016squad,
title = {SQuAD: 100,000+ Questions for Machine Comprehension of Text},
author = {Pranav Rajpurkar and Jian Zhang and Konstantin Lopyrev and Percy Liang},
year = {2016},
eprint = {1606.05250},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% Multilingual Universal Sentence Encoder for Semantic Retrieval
@misc{yang2019multilingual,
title = {Multilingual Universal Sentence Encoder for Semantic Retrieval},
author = {Yinfei Yang and Daniel Cer and Amin Ahmad and Mandy Guo and Jax Law and Noah Constant and Gustavo Hernandez Abrego and Steve Yuan and Chris Tar and Yun-Hsuan Sung and Brian Strope and Ray Kurzweil},
year = {2019},
eprint = {1907.04307},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% SentenceTransformers models
@misc{reimers2019sentencebert,
title = {Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
author = {Nils Reimers and Iryna Gurevych},
year = {2019},
eprint = {1908.10084},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% BGE
@misc{xiao2024cpack,
title = {C-Pack: Packaged Resources To Advance General Chinese Embedding},
author = {Shitao Xiao and Zheng Liu and Peitian Zhang and Niklas Muennighoff},
year = {2023},
eprint = {2309.07597},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% nomic-embed-text
@misc{nussbaum2024nomic,
title = {Nomic Embed: Training a Reproducible Long Context Text Embedder},
author = {Zach Nussbaum and John X. Morris and Brandon Duderstadt and Andriy Mulyar},
year = {2024},
eprint = {2402.01613},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% Rotary Position Embedding
@misc{su2023roformer,
title = {RoFormer: Enhanced Transformer with Rotary Position Embedding},
author = {Jianlin Su and Yu Lu and Shengfeng Pan and Ahmed Murtadha and Bo Wen and Yunfeng Liu},
year = {2021},
eprint = {2104.09864},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% SwiGLU
@misc{shazeer2020glu,
title = {GLU Variants Improve Transformer},
author = {Noam Shazeer},
year = {2020},
eprint = {2002.05202},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
% Matryoshka Representation Learning
@misc{kusupati2024matryoshka,
title = {Matryoshka Representation Learning},
author = {Aditya Kusupati and Gantavya Bhatt and Aniket Rege and Matthew Wallingford and Aditya Sinha and Vivek Ramanujan and William Howard-Snyder and Kaifeng Chen and Sham Kakade and Prateek Jain and Ali Farhadi},
year = {2022},
eprint = {2205.13147},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
% Retrieval-Augmented Generation
@misc{lewis2021retrievalaugmented,
title = {Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks},
author = {Patrick Lewis and Ethan Perez and Aleksandra Piktus and Fabio Petroni and Vladimir Karpukhin and Naman Goyal and Heinrich Küttler and Mike Lewis and Wen-tau Yih and Tim Rocktäschel and Sebastian Riedel and Douwe Kiela},
year = {2021},
eprint = {2005.11401},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% mxbai-embed-large-v1
@online{emb2024mxbai,
title = {Open Source Strikes Bread - New Fluffy Embeddings Model},
author = {Sean Lee and Aamir Shakir and Darius Koenig and Julius Lipp},
year = {2024},
url = {https://www.mixedbread.ai/blog/mxbai-embed-large-v1}
}
% mxbai-embed-2d-large-v1
@online{emb2024mxbai2d,
title = {Fresh 2D-Matryoshka Embedding Model},
author = {Sean Lee and Aamir Shakir and Julius Lipp and Darius Koenig},
year = {2024},
url = {https://www.mixedbread.ai/blog/mxbai-embed-2d-large-v1}
}
% ESE: Espresso Sentence Embeddings
@misc{li2024ese,
title = {ESE: Espresso Sentence Embeddings},
author = {Xianming Li and Zongxi Li and Jing Li and Haoran Xie and Qing Li},
year = {2024},
eprint = {2402.14776},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% SetFit
@misc{tunstall2022efficient,
title = {Efficient Few-Shot Learning Without Prompts},
author = {Lewis Tunstall and Nils Reimers and Unso Eun Seo Jo and Luke Bates and Daniel Korat and Moshe Wasserblat and Oren Pereg},
year = {2022},
eprint = {2209.11055},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% MEDI dataset
@misc{su2023embedder,
title = {One Embedder, Any Task: Instruction-Finetuned Text Embeddings},
author = {Hongjin Su and Weijia Shi and Jungo Kasai and Yizhong Wang and Yushi Hu and Mari Ostendorf and Wen-tau Yih and Noah A. Smith and Luke Zettlemoyer and Tao Yu},
year = {2023},
eprint = {2212.09741},
archiveprefix = {arXiv},
primaryclass = {cs.CL}
}
% GIST Embedding v0
@misc{solatorio2024gistembed,
title = {GISTEmbed: Guided In-sample Selection of Training Negatives for Text Embedding Fine-tuning},
author = {Aivin V. Solatorio},
year = {2024},
eprint = {2402.16829},
archiveprefix = {arXiv},
primaryclass = {cs.LG}
}
% RAG with different embeddings
@online{joshi2024RAGemb,
title = {RAG Series Part 1: How to Choose the Right Embedding Model for Your Application},
author = {Apoorva Joshi},
year = {2024},
url = {https://www.mongodb.com/developer/products/atlas/choose-embedding-model-rag/}
}
% RAG with different chunks
@online{theja2023RAGchunk,
title = {Evaluating the Ideal Chunk Size for a RAG System using LlamaIndex},
author = {Ravi Theja},
year = {2023},
url = {https://www.llamaindex.ai/blog/evaluating-the-ideal-chunk-size-for-a-rag-system-using-llamaindex-6207e5d3fec5}
}
% Common Crawl
@online{commoncrawl,
title = {Common Crawl},
url = {https://commoncrawl.org/}
}
% Umar Jamil YouTube Channel
@online{umarjamilai,
title = {Umar Jamil [YouTube Channel]},
author = {Umar Jamil},
url = {https://www.youtube.com/@umarjamilai}
}