-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCourseListToCSV.py
134 lines (124 loc) · 8.29 KB
/
CourseListToCSV.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#===============================================================================
# Name: CourseListToCSV
# Purpose: To parse websites running AKARI Curriculum Management version 4.0.29, which show course and module content,
# and extract those details into files, as it is not possible to do so directly in RapidMiner 5.
#
# Author: Michael O'Brien
# Requirements:
# Python 2.7.10 including pip package to install the BeautifulSoup4 and selenium modules
# Firefox web browser to be driven by python selenium to open urls and navigate
# Libraries
# BeautifulSoup4 used to extract the content from the html
# Selenium is used to parse the overlay that shows the course lists for each department
# urllib2 used to call the URLs
# unicode csv to write unicode to csv files
# Created: Nov 2015
#===============================================================================
# Import the beautiful soup library
from bs4 import BeautifulSoup
# import urllib2 library to actually go get the webpage for Beautiful Soup
import urllib2
# Import Selenium and the code needed to wait for the page to load
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Import the unicode csv library installed using pip unicodecsv
import unicodecsv as csv
import sys
#Needed to write unicode to text file
import codecs
# Import my code that parses the individual course pages and the module pages listed there in
import ParseCoursePage
def _write_course_html(CourseCode, CourseOutcomes, CourseModuleText):
    """Write the outcomes and module text of one course to '<CourseCode>.html'.

    CourseCode       -- used both as the file name stem and as the id of the outcomes div
    CourseOutcomes   -- text placed inside the outcomes div
    CourseModuleText -- text written under the "Module Content" heading

    NOTE(review): assumes both text arguments are (unicode) strings as returned by
    ParseCoursePage.main -- confirm against that module.
    """
    # The context manager closes every course file, not just the last one created
    with codecs.open(CourseCode + ".html", "w", encoding='utf-8') as MyHTMLFile:
        MyHTMLFile.write("<h1> Course Outcomes for " + CourseCode + "</h1>")
        # Build one string: the previous comma separated expression created a tuple
        # whose repr (brackets, quotes and u'' prefixes) ended up in the html
        MyHTMLFile.write(u"<div id={0}>{1}</div>".format(CourseCode, CourseOutcomes))
        MyHTMLFile.write("<h1> Module Content </h1>")
        MyHTMLFile.write(CourseModuleText)


def main(OutputFileName="ITTCourseList.csv", FileDelimiter=";", GetCoursesFromURL='http://courses.it-tallaght.ie/',DeptListDivID='homeProgrammes', WebPageLoadDelay=10):
    """Write the course list of a site to a csv file and one html file per course.

    The department links are read from the div DeptListDivID on GetCoursesFromURL.
    Each one is clicked with selenium to open the overlay listing its courses and
    every course link found there is handed to ParseCoursePage.main. One row per
    course is written to OutputFileName and the course outcomes and module text
    are written to '<CourseCode>.html'.

    OutputFileName    -- name of the csv file to create
    FileDelimiter     -- column delimiter used in the csv file
    GetCoursesFromURL -- home page of the course site, also the prefix of every course link
    DeptListDivID     -- id of the div holding the list of departments
    WebPageLoadDelay  -- seconds selenium waits for elements to become available

    The function does not return: it ends with sys.exit(0), or with sys.exit(1)
    when the course overlay of a department cannot be found or closed.
    """
    # Function Parameters for IT-Tallaght OutputFileName="ITTCourseList.csv", FileDelimiter=";", GetCoursesFromURL='http://courses.it-tallaght.ie/',DeptListDivID='homeProgrammes', WebPageLoadDelay=10
    # Function Parameters for IT-Blanch Course: OutputFileName="ITBlanchCourseList.csv", FileDelimiter=";", GetCoursesFromURL='http://courses.itb.ie/',DeptListDivID='homeProgrammesWide', WebPageLoadDelay=10
    # Imported here as WebDriverWait.until() reports a timeout with this exception
    from selenium.common.exceptions import TimeoutException

    URLToParse = GetCoursesFromURL
    # Create the csv file in (w)rite mode; the with block closes it on every exit path
    with open(OutputFileName, "wb") as MyCSVFile:
        CourseList = csv.writer(MyCSVFile, delimiter=FileDelimiter)
        # Write the 1st row to give the column names
        CourseList.writerow(['Dept', 'link', 'CourseName','CourseAward', 'CourseCode', 'CourseDelivery', 'SemesterCount', 'CourseNFQLevel', 'CourseDepartment'])
        # Open the webpage using urllib2 and parse the content using soup
        WebContent = urllib2.urlopen(URLToParse)
        try:
            soup = BeautifulSoup(WebContent, "html.parser")
        finally:
            WebContent.close()
        # Open the webpage using selenium
        driver = webdriver.Firefox()
        try:
            # Give elements up to WebPageLoadDelay seconds to appear before a lookup fails
            driver.implicitly_wait(WebPageLoadDelay) # seconds
            print('Trying to parse ', URLToParse ,' now')
            driver.get(URLToParse)
            subset = driver.find_element_by_id(DeptListDivID)
            # Just get the part of the document that contains the list of departments
            print('Finding the DIV Id', DeptListDivID, " on the webpage")
            Depts = soup.find(id=DeptListDivID)
            for links in Depts.findAll('a'):
                print(links)
                print('Processing Department ',links.string,' link(s) now')
                # Using selenium find the link to the depts list of courses that matches the link string from beautiful soup and click it
                FollowLink = subset.find_element_by_link_text(links.string)
                FollowLink.click()
                try:
                    # Wait for the overlay i.e. the list of courses in the div ProgrammeListForDepartment
                    Overlay = WebDriverWait(driver, WebPageLoadDelay).until(EC.presence_of_element_located((By.ID, "ProgrammeListForDepartment")))
                    # Get it as a Beautiful soup object too as its easier to read
                    SoupOverlay = BeautifulSoup(Overlay.get_attribute('outerHTML'), "html.parser")
                    # close the overlay
                    Overlay.find_element_by_link_text("close").click()
                except (NoSuchElementException, TimeoutException) as OverlayError:
                    # msg is an attribute of the raised exception, not a method of the class
                    print(OverlayError.msg)
                    # Exit now, the finally and with blocks still close the browser and the csv file
                    sys.exit(1)
                # loop over the links
                for courselink in SoupOverlay.findAll('a'):
                    CourseHref = courselink.get('href')
                    # Skip anchors with an empty or a missing href (get returns None for those)
                    if not CourseHref:
                        continue
                    FullLink = URLToParse + CourseHref
                    print("--Found these non blank urls--")
                    print("Dept: ", links.string, " link ",FullLink," Course Name", courselink.getText())
                    # Parse the course link itself and its child modules
                    print('Getting the course details and module text for ',courselink.getText()," now")
                    CourseContentsDictionary = ParseCoursePage.main(FullLink, URLToParse)
                    print("Got the following keys", CourseContentsDictionary.keys(), " back from the parsing function")
                    # Use the Coursecode as the unique filename. It is kept as unicode because
                    # str() raises UnicodeEncodeError for a non ascii code in Python 2
                    TextContentsFileName = CourseContentsDictionary['CourseCode'].strip()
                    _write_course_html(TextContentsFileName, CourseContentsDictionary['CourseOutcomes'], CourseContentsDictionary['CourseModuleText'])
                    print("Writing the Module contents for ",TextContentsFileName," to file")
                    # Write the row built from the data ParseCoursePage pulled from the course page and the module pages linked to it
                    print('Writing ', courselink.getText(), 'to file','TextContentsFile')
                    CourseList.writerow([links.string, FullLink, courselink.getText(),CourseContentsDictionary['CourseAward'] ,CourseContentsDictionary['CourseCode'],CourseContentsDictionary['CourseDelivery'], CourseContentsDictionary['SemesterCount'] ,CourseContentsDictionary['CourseNFQLevel'] ,CourseContentsDictionary['CourseDepartment']])
                    MyCSVFile.flush()
        finally:
            # End the browser session even when parsing failed part of the way through
            driver.quit()
    # Both the csv file and the browser are closed at this point
    print('File', MyCSVFile.name ,' closed')
    print('External Web browser closed')
    # Exit successfully
    sys.exit(0)
# End of Main
# Run the scraper with its default (IT-Tallaght) parameters when executed as a script
if __name__ == "__main__":
    main()