Can now also AJAX download and show abstracts for every paper at user…

… will
karpathy · Nov 26, 2012 · 3d4bb82 · 3d4bb82
1 parent ff7ac77
commit 3d4bb82
Show file tree

Hide file tree

Showing 5 changed files with 113 additions and 7 deletions.
diff --git a/Readme.md b/Readme.md
@@ -1,15 +1,15 @@
 
 # NIPS papers pretty html
 
-This is a set of scripts for creating nice preview page (see here: http://cs.stanford.edu/~karpathy/nipspreview/ ) for all papers published at NIPS. I hope these scripts can be useful to others to create similar pages for other conferences. They show how one can manipulate PDFs, extract image thumbnails, analyze word frequencies, etc.
+This is a set of scripts for creating a nice preview page (see here: http://cs.stanford.edu/~karpathy/nipspreview/ ) for all papers published at NIPS. I hope these scripts can be useful to others to create similar pages for other conferences. They show how one can manipulate PDFs, extract image thumbnails, analyze word frequencies, do AJAX requests to load abstracts, etc.
 
 #### Installation
 
-0. Clone this repository to $FOLDER `git clone https://github.com/karpathy/nipspreview.git`
+0. Clone this repository `git clone https://github.com/karpathy/nipspreview.git`
 
-1. Download nips25offline from `http://books.nips.cc/nips25.html` and move it into $FOLDER.
+1. Download nips25offline from `http://books.nips.cc/nips25.html` and move it into the folder created in step 0
 
-2. Install ImageMagick: `sudo apt-get install imagemagick`
+2. Make sure you have ImageMagick: `sudo apt-get install imagemagick`
 
 3. Run `pdftowordcloud.py` (to generate top words for each paper. Output saved in topwords.p as pickle)
 
@@ -21,7 +21,9 @@ This is a set of scripts for creating nice preview page (see here: http://cs.sta
 
 7. Run `python lda.py -f allpapers.txt -k 7 --alpha=0.5 --beta=0.5 -i 100` . This will generate a pickle file called `ldaphi.p` that contains the LDA word distribution matrix. Thanks to this [nice LDA code](https://github.com/shuyo/iir/blob/master/lda/lda.py) by shuyo! It requires nltk library and numpy. In this example we are using 7 categories. You would need to change the `nipsnice_template.html` file a bit if you wanted to try different number of categories.
 
-8. Finally, run `generatenicelda.py` (to create the nipsnice.html page)
+8. Generate the abstract files inside abstracts/ folder using `getabstracts.py`. Some user interaction may be necessary because of poorly formatted papers that make abstract extraction a pain. See script.
+
+9. Finally, run `generatenicelda.py` (to create the nipsnice.html page)
 
 #### Licence
 

diff --git a/abstracts/dummy.txt b/abstracts/dummy.txt
diff --git a/generatenicelda.py b/generatenicelda.py
@@ -58,6 +58,7 @@
 js = "ldadist=["
 js2 = "pairdists=["
 for pid, p in enumerate(paperdict):
+	# pid goes 0...N-1 (enumerate index), p are the keys, pointing to actual paper IDs as given by NIPS, ~1...1500 with gaps
 
 	# get title, author
 	title, author = paperdict[p]
@@ -113,13 +114,15 @@
 		<a href="%s">[pdf] </a>
 		<a href="%s">[bibtex] </a>
 		<a href="%s">[supplementary]<br /></a>
-		<span class="sim" id="sim%d">[rank by tf-idf similarity to this]</span>
+		<span class="sim" id="sim%d">[rank by tf-idf similarity to this]</span><br />
+		<span class="abstr" id="ab%d">[abstract]</span>
 	</div>
 	<img src = "%s"><br />
+	<div class = "abstrholder" id="abholder%d"></div>
 	<span class="tt">%s</span>
 	</div>
 
-	""" % (pid, title, author, pdflink, bibtexlink, supplink, pid, thumbpath, tcat)
+	""" % (pid, title, author, pdflink, bibtexlink, supplink, pid, int(p), thumbpath, int(p), tcat)
 
 
 newhtml = html.replace("RESULTTABLE", s)

diff --git a/getabstracts.py b/getabstracts.py
@@ -0,0 +1,62 @@
+# attempts to extract abstracts from the .pdfs 
+# this is a tricky process and will fail for some papers, in which
+# case the script tries to alert the user
+# this file is super hacky I'd adwise careful use
+
+# output is a set of files inside abstracts/ folder , one for each paper
+
+import os
+from string import punctuation
+from operator import itemgetter
+
+# get list of all PDFs supplied by NIPS
+relpath = "nips25offline/content/"
+allFiles = os.listdir(relpath)
+pdfs = [x for x in allFiles if x.endswith(".pdf")]
+
+for i,f in enumerate(pdfs):
+	paperid = f[9:-4]
+	fullpath = relpath + f
+
+	print "processing %s, %d/%d" % (paperid, i, len(pdfs))
+
+	# create text file from the pdf contet
+	cmd = "pdftotext %s %s" % (fullpath, "out.txt")
+	#print "EXEC: " + cmd
+	os.system(cmd)
+
+	txt = open("out.txt").read()
+	L = txt.split("\n")
+
+	# basically, we find the line that says Abstract
+	# and then go down until we see an empty line
+	# i couldn't find a better way because there is a lot
+	# of variation. On NIPS 2012 papers, this fails for 
+	# 2 badly formatted papers, in which case the script
+	# alerts the user, I had to go in open that file manually
+	# and fix the abstract text inside it.
+	print "----------"
+	i=0
+	while (not L[i]=="Abstract") and i<len(L): i+=1
+	str = []
+	while (not L[i]=="") and i<len(L): 
+		i+=1
+		str.append(L[i])
+	abstract = " ".join(str)
+	print abstract
+	print len(abstract)
+	print "----------"
+
+	f = open("abstracts/a%d.txt" % (int(paperid), ), "w")
+	f.write(abstract);
+	f.close()
+
+	# suspicious: this abstract is too long. Maybe its right,
+	# let the user deicde. If not, user can go in and fix the
+	# mistake manually
+	if len(abstract) > 2000:
+		print txt[:3000]
+		print paperid
+		a= raw_input()
+
+
diff --git a/nipsnice_template.html b/nipsnice_template.html
@@ -107,6 +107,21 @@
 	text-decoration: underline;
 }
 
+.abstr {
+	cursor: pointer;
+	text-decoration: underline;
+}
+
+.abstrholder {
+	background-color: #DFD;
+	border: 1px solid #BDB;
+	font-size: 12px;
+	padding: 10px;
+	border-radius: 5px;
+	display: none; /* so that these are hidden initially */
+	margin-bottom: 5px;
+}
+
 </style>
 
 <script src="jquery-1.8.3.min.js"></script>
@@ -229,6 +244,30 @@
 		// also scroll to top
 		$('html, body').animate({ scrollTop: 0 }, 'fast');
 	});
+
+	// user clicks on "abstract button for some paper
+	$(".abstr").click(function() {
+		var pid = parseInt($(this).attr('id').substring(2)); // id of the paper clicked
+		var aurl = "abstracts/a" + pid + ".txt";
+		var holderdiv = "#abholder" + pid;
+
+		if($(holderdiv).is(':visible')) {
+
+			$(holderdiv).slideUp(); // hide the abstract away
+
+		} else {
+
+			// do ajax request and fill the abstract div with the result
+			$.ajax({
+	            url : aurl,
+	            dataType: "text",
+	            success : function (data) {
+	                $(holderdiv).html(data);
+	                $(holderdiv).slideDown();
+	            }
+	        });	
+		}
+	});
 });
 
 </script>