I do just that (scrape the website).
The code isn't pretty, so I didn't publish it, but you could use it as a starting point. It does a bit more than a plain scraper: it caches the whole year of data so that I don't spam them when I import prices for a full month. It's not 100% foolproof, since the first request in get_historical_price isn't cached.

Here is how you'd specify it in your ledger (the source string is the quote currency, then the source module, then the ticker):


2020-01-01 commodity LU0102238812
  price: "EUR:price.ft_com.fund/LU0102238812:EUR"
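
With that metadata in place, prices are fetched with bean-price as usual. A minimal sketch for a historical date, assuming the modules shown below are importable (e.g. via PYTHONPATH) and a ledger file named ledger.beancount:

PYTHONPATH=. bean-price --date=2020-01-01 ledger.beancount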

And here is the code (I use it for both ETFs and funds):

import datetime
import requests
import shelve
import tempfile
import re
import json
import os
from bs4 import BeautifulSoup

from beancount.core.number import D
from beancount.prices import source

def now():
    # Current UTC time as a Unix timestamp, used for the cache expiry below.
    return int(datetime.datetime.now(datetime.timezone.utc).timestamp())

def source_price(price, dt, currency):
    # FT quotes some instruments in GBX (pence sterling); convert to GBP.
    if currency == 'GBX':
        currency = 'GBP'
        price = price / 100
    return source.SourcePrice(price, dt, currency)

class BaseSource(source.Source):

    uri = ""

    def find_current_price(self, soup):
        # The tearsheet page has a single overview block whose first list
        # item reads "Price (CUR)" followed by the quoted value.
        elements = soup.find_all('div', class_='mod-tearsheet-overview__quote')
        assert len(elements) == 1
        ul = elements[0].contents[0]
        assert ul.name == "ul"
        li = ul.contents[0]
        assert li.name == "li"
        currency = re.search(r"Price \((.*)\)", li.contents[0].text).group(1)
        price_str = li.contents[1].text
        return price_str, currency


    def get_latest_price(self, ticker):
        response = requests.get(self.uri, params={'s': ticker})

        soup = BeautifulSoup(response.text, 'html.parser')
        price_str, currency = self.find_current_price(soup)
        element = soup.find('div', class_='mod-tearsheet-overview__quote')
        disclaimer = element.contents[1]

        # The disclaimer under the quote ends with the quote time,
        # e.g. "... as of Sep 24 2020 16:30 GMT."
        quote_date_m = re.search(", as of (.*)$", disclaimer.text)
        if quote_date_m:
            dt = datetime.datetime.strptime(quote_date_m.group(1), "%b %d %Y %H:%M GMT.")
            dt = dt.replace(tzinfo=datetime.timezone.utc)
        else:
            dt = None

        return source_price(D(price_str), dt, currency)

    def get_historical_price(self, ticker, time):
        response = requests.get(self.uri, params={'s': ticker})

        soup = BeautifulSoup(response.text, 'html.parser')
        price_str, currency = self.find_current_price(soup)
        # FT's internal symbol id is embedded in the historical prices widget config.
        elements = soup.find_all('div', attrs={"data-module-name": "HistoricalPricesApp"})
        assert len(elements) == 1
        symbol_id = json.loads(elements[0]['data-mod-config'])['symbol']
        year = (time - datetime.timedelta(days=1)).year

        # Cache the whole year of history in a shelve file under the temp
        # directory for 24 hours, so importing a full month of prices only
        # hits the AJAX endpoint once.
        temp_dir = tempfile.gettempdir()
        cache_path = os.path.join(temp_dir, f'ft_com-{year}-{symbol_id}')
        data_uri = 'https://markets.ft.com/data/equities/ajax/get-historical-prices'
        args = {
            'startdate': f'{year}/01/01',
            'endDate': f'{year}/12/31',
            'symbol': symbol_id
        }
        with shelve.open(cache_path) as db:
            if db.get('expiry', -1) < now():
                db['expiry'] = now() + (3600 * 24)
                history_response = requests.get(data_uri, params=args).json()
                db['result'] = history_response
            else:
                history_response = db.get('result')

        # The response holds an HTML table, newest rows first; return the
        # first row dated on or before the requested time (column 4 holds
        # the closing price).
        history_soup = BeautifulSoup(history_response['html'], 'html.parser')
        for tr in history_soup.contents:
            date_str = tr.contents[0].contents[0].text
            dt = datetime.datetime.strptime(date_str, "%A, %B %d, %Y")
            dt = dt.replace(tzinfo=datetime.timezone.utc)
            if dt <= time:
                price_str = tr.contents[4].text
                return source_price(D(price_str), dt, currency)
        assert False, "no price found on or before the requested date"


# This subclass lives in a separate module (the one named in the price
# metadata) and only needs to point at the funds tearsheet URL.
from .base import BaseSource

class Source(BaseSource):
    uri = "https://markets.ft.com/data/funds/tearsheet/historical"
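
The module path in the price metadata (price.ft_com.fund) suggests a package layout roughly like this; the file names are my guess, and an etf.py subclass pointing at the ETF tearsheet would follow the same pattern:

price/
    __init__.py
    ft_com/
        __init__.py
        base.py    # BaseSource and the helpers above
        fund.py    # the Source subclass above

For a quick manual check outside bean-price, something like this should work (again assuming that layout):

import datetime
from price.ft_com.fund import Source

src = Source()
print(src.get_latest_price('LU0102238812:EUR'))
print(src.get_historical_price(
    'LU0102238812:EUR',
    datetime.datetime(2020, 1, 2, tzinfo=datetime.timezone.utc)))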

On 25/09/2020 09:15, Daniele Nicolodi wrote:
Hello,

I am looking for historical price data for investment funds. I found the
data for some funds only on the Financial Times website (for example
https://markets.ft.com/data/funds/tearsheet/summary?s=LU0102238812:EUR)
However, the FT does not offer a public API to download this data, which
would need to be scraped from the website. Does anyone have a better
solution?

Thank you.

Cheers,
Dan

