I do just that (scrape the website).
The code isn't nice so I didn't publish it, but I guess you could use it
as a starting point. It does a bit more since I try to cache the whole
year of data and not spam them when I import prices for the full month.
It's not 100% foolproof since I don't cache the first request in
'get_historical_price'.
Here is how you'd specify it:
2020-01-01 commodity LU0102238812
price: "EUR:price.ft_com.fund/LU0102238812:EUR"
And here is the code (I use it for both ETFs and funds)
import datetime
import requests
import shelve
import tempfile
import re
import json
import os
from bs4 import BeautifulSoup
from beancount.core.number import D
from beancount.prices import source
def now():
    """Return the current time as an integer Unix timestamp (UTC)."""
    current = datetime.datetime.now(datetime.timezone.utc)
    return int(current.timestamp())
def source_price(price, dt, currency):
    """Build a SourcePrice, normalizing pence-quoted (GBX) prices to GBP."""
    if currency != 'GBX':
        return source.SourcePrice(price, dt, currency)
    # Some UK listings are quoted in pence; convert to pounds.
    return source.SourcePrice(price / 100, dt, 'GBP')
class BaseSource(source.Source):
    """Beancount price source scraping quote data from markets.ft.com.

    Subclasses set ``uri`` to the tearsheet URL for the instrument type
    (funds, ETFs, ...).  Tickers are the FT ``SYMBOL:CURRENCY`` form,
    e.g. ``LU0102238812:EUR``.
    """

    # Tearsheet page, queried with ?s=<ticker>; set by subclasses.
    uri = ""

    def find_current_price(self, soup):
        """Extract ``(price_str, currency)`` from a parsed tearsheet page.

        Raises AssertionError if the page does not contain exactly one
        quote block in the expected layout.
        """
        elements = soup.find_all('div',
                                 class_='mod-tearsheet-overview__quote')
        assert len(elements) == 1
        ul = elements[0].contents[0]
        assert ul.name == "ul"
        li = ul.contents[0]
        assert li.name == "li"
        # The first list item reads like "Price (EUR)" followed by the value.
        currency = re.search(r"Price \((.*)\)",
                             li.contents[0].text).group(1)
        price_str = li.contents[1].text
        return price_str, currency

    def get_latest_price(self, ticker):
        """Return the most recent quote shown on the tearsheet page.

        The quote timestamp is parsed from the disclaimer line when
        present (e.g. ", as of Sep 25 2020 16:35 GMT."); otherwise the
        price is returned with no date.
        """
        response = requests.get(self.uri, {'s': ticker}, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        price_str, currency = self.find_current_price(soup)
        element = soup.find('div', class_='mod-tearsheet-overview__quote')
        disclaimer = element.contents[1]
        quote_date_m = re.search(r", as of (.*)$", disclaimer.text)
        if quote_date_m:
            # Format was split across lines in the original paste;
            # reconstructed as a single strptime pattern.
            dt = datetime.datetime.strptime(quote_date_m.group(1),
                                            "%b %d %Y %H:%M GMT.")
            dt = dt.replace(tzinfo=datetime.timezone.utc)
        else:
            dt = None
        return source_price(D(price_str), dt, currency)

    def get_historical_price(self, ticker, time):
        """Return the last quote on or before *time* (tz-aware datetime).

        A full year of history is fetched in one request and cached in a
        shelve file under the system temp directory for 24 hours, so that
        importing a month of prices does not hammer the FT servers.  The
        initial tearsheet request (needed to resolve the internal symbol
        id) is not cached.

        Raises ValueError if no quote at or before *time* is found.
        """
        response = requests.get(self.uri, {'s': ticker}, timeout=30)
        soup = BeautifulSoup(response.text, 'html.parser')
        price_str, currency = self.find_current_price(soup)
        elements = soup.find_all('div', attrs={"data-module-name":
                                               "HistoricalPricesApp"})
        assert len(elements) == 1
        # Internal numeric id required by the historical-prices endpoint.
        symbol_id = json.loads(elements[0]['data-mod-config'])['symbol']
        year = (time - datetime.timedelta(days=1)).year
        temp_dir = tempfile.gettempdir()
        cache_path = os.path.join(temp_dir, f'ft_com-{year}-{symbol_id}')
        data_uri = 'https://markets.ft.com/data/equities/ajax/get-historical-prices'
        args = {
            'startdate': f'{year}/01/01',
            'endDate': f'{year}/12/31',
            'symbol': symbol_id,
        }
        with shelve.open(cache_path) as db:
            if db.get('expiry', -1) < now():
                # Cache miss or stale entry: refetch and keep for 24 hours.
                db['expiry'] = now() + (3600 * 24)
                history_response = requests.get(data_uri, args,
                                                timeout=30).json()
                db['result'] = history_response
            else:
                history_response = db.get('result')
        history_soup = BeautifulSoup(history_response['html'],
                                     'html.parser')
        # Rows appear newest-first; take the first one at or before *time*.
        for tr in history_soup.contents:
            date_str = tr.contents[0].contents[0].text
            dt = datetime.datetime.strptime(date_str, "%A, %B %d, %Y")
            dt = dt.replace(tzinfo=datetime.timezone.utc)
            if dt <= time:
                price_str = tr.contents[4].text
                return source_price(D(price_str), dt, currency)
        raise ValueError(
            f"no historical price for {ticker} on or before {time}")
from .base import BaseSource
class Source(BaseSource):
    """FT price source for the funds tearsheet (works for ETFs and funds)."""

    uri = "https://markets.ft.com/data/funds/tearsheet/historical"
On 25/09/2020 09:15, Daniele Nicolodi wrote:
Hello,
I am looking for historical price data for investment funds. I found the
data for some funds only on the Financial Times website (for example
https://markets.ft.com/data/funds/tearsheet/summary?s=LU0102238812:EUR)
However, the FT does not offer a public API to download this data, which
would need to be scraped from the website. Does anyone have a better
solution?
Thank you.
Cheers,
Dan
--
You received this message because you are subscribed to the Google Groups
"Beancount" group.
To unsubscribe from this group and stop receiving emails from it, send an email
to [email protected].
To view this discussion on the web visit
https://groups.google.com/d/msgid/beancount/444ed295-93ec-535b-09a9-fb42ec7e2b71%40gmail.com.