Here is my forex factory calendar downloader. It creates a CSV file containing historical events from forexfactory.
It's written in Python and uses lxml; it's a good starting point for those who have never done web scraping before. The code is quite clean, but it doesn't have any real error handling yet.
Also, it creates a 'raw' CSV view of what is available on the website. It does not fill in @NA data and doesn't try to be smart about the data. I intend to add some 'smart' behaviour during the import into the SQL database.
# Module-level setup: defines the month abbreviations used when building URLs,
# creates a cookie-aware mechanize browser, and submits the site's timezone
# form with an offset of "0" so that later requests share the same session
# cookies (and therefore the same timezone setting).

# Our month list for the URL (three-letter abbreviations, calendar order).
monthslist = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"]

# Sets up the browser. The cookie jar is what carries the timezone choice
# made below over to subsequent requests issued through ``br``.
br = mechanize.Browser()
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)
br.addheaders = [
    ('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1'),
]

# Set correct timezone.
# NOTE(review): the URL argument was lost in the original text (it read
# ``br.open(")``, an unterminated string). The site root is assumed here;
# confirm which page actually carries the timezone.php form.
br.open("http://www.forexfactory.com/")

# Locate the form whose action points at timezone.php, remembering its
# position so it can be selected by index afterwards.
formindex = 0
for form in br.forms():
    if "timezone.php" in form.action:
        # Offset "0" -- presumably GMT/UTC; verify against the site's form.
        form["timezoneoffset"] = ["0"]
        break
    formindex += 1
else:
    # Without this guard ``formindex`` would point one past the last form and
    # ``select_form`` would fail with a less helpful error.
    raise RuntimeError("timezone.php form not found; cannot set timezone offset")

br.select_form(nr=formindex)
br.submit()
def getData(html, outfile):
"""
Gets data from one page of events
"""
root = lxml.html.fromstring(html)
lines = root.find_class("calendar_row")
curWeekDay = None
curMonthDay = None
for event in lines:
date = event.xpath("td[@class='date']")[0]
#get the day of the month
weekDay = date.xpath("span")
monthDay = date.xpath("span/span")
if len(weekDay) > 0:
curWeekDay = weekDay[0].text
curMonthDay = monthDay[0].text
#get the time
time = event.xpath("td[@class='time']")[0].text if (len(event.xpath("td[@class='time']")) > 0) else ""
#get currency
currency = event.xpath("td[@class='currency']")[0].text if len(event.xpath("td[@class='currency']")) else ""
#get impact
impact = event.xpath("td[@class='impact']/span/@title")[0]\
if len(event.xpath("td[@class='impact']/span/@title")) else ""
#get name of event
nevent = event.xpath("td[@class='event']/span")[0].text if len(event.xpath("td[@class='event']/span")) > 0 else ""
#get actual
actual = event.xpath("td[@class='actual']")[0].text if len(event.xpath("td[@class='actual']")) else ""
#retry if actual is in a span (can happen if they colorize it)
if actual is None or len(actual.strip()) == 0:
actual = event.xpath("td[@class='actual']/span")[0].text if len(event.xpath("td[@class='actual']/span")) else ""
actual = actual.strip().replace("\n", " ") if actual is not None else ""
#get forecast
forecast = event.xpath("td[@class='forecast']")[0].text if len(event.xpath("td[@class='forecast']")) else ""
#retry if forecast is in a span (can happen if they colorize it)
if forecast is None or len(forecast.strip()) == 0:
forecast = event.xpath("td[@class='forecast']/span")[0].text if len(event.xpath("td[@class='forecast']/span")) else ""
forecast …