Source code for ml_investment.data_loaders.sf1

'''
Loaders for dataset provided by https://www.quandl.com/databases/SF1/data. Data may be downloaded by script 
:func:`~ml_investment.download_scripts.download_sf1.main`

Expected structure of dataset
    | SF1
    | ├── core_fundamental
    | │   ├── AAPL.json
    | │   ├── FB.json
    | │   └── ...
    | ├── daily
    | │   ├── AAPL.json
    | │   ├── FB.json
    | │   └── ...
    | └── tickers.zip       
'''

import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional, Union, List
from ..utils import load_json, load_config




def _load_df(json_path: str) -> pd.DataFrame:
    '''
    Load data from SF1 .json file and convert it to pd.DataFrame
    
    Parameters
    ----------
    json_path:
        path to SF1 .json path
        
    Returns
    -------
        ``pd.DataFrame`` content of file
    '''
    data = load_json(json_path)
    df = pd.DataFrame(data['datatable']['data'])
    if len(df) == 0:
        columns = [x['name'] for x in data['datatable']['columns']]
        df = pd.DataFrame(columns=columns, dtype=object)
    else:
        df.columns = [x['name'] for x in data['datatable']['columns']]

    df = df.infer_objects()
    
    return df



[docs]def translate_currency(df: pd.DataFrame, columns: Optional[List[str]] = None): ''' Translate currency of columns to USD according course information in appropriate columns(like debtusd-debt) Parameters ---------- df: quarterly-based data columns: columns to translate currency Returns ------- ``pd.DataFrame`` result with the same columns and shapes but with converted currency in columns ''' if columns is None: no_translate_cols = ['ticker', 'dimension', 'calendardate', 'datekey', 'reportperiod', 'date', 'marketcap', 'lastupdated'] no_translate_cols += [x for x in df.columns if 'usd' in x] columns = [x for x in df.columns if x not in no_translate_cols] df = df.infer_objects() usd_cols = ['equityusd','epsusd','revenueusd','netinccmnusd', 'cashnequsd','debtusd','ebitusd','ebitdausd'] usd_cols = list(set(df.columns).intersection(set(usd_cols))) assert len(usd_cols) > 0 rows = np.array([(df[col.replace('usd', '')] / df[col]).values for col in usd_cols]) df['trans_currency'] = np.nanmax(rows, axis=0).astype('float32') df['trans_currency'] = df['trans_currency'].interpolate() for col in columns: df[col] = df[col] / df['trans_currency'] return df
[docs]class SF1BaseData: ''' Load base information about company(like sector, industry etc) ''' def __init__(self, data_path: Optional[str]=None): ''' Parameters ---------- data_path: path to :mod:`~ml_investment.data_loaders.sf1` dataset folder If None, than will be used ``sf1_data_path`` from `~/.ml_investment/config.json` ''' if data_path is None: data_path = load_config()['sf1_data_path'] self.data_path = data_path
[docs] def load(self, index: Optional[List[str]]=None) -> pd.DataFrame: ''' Parameters ---------- index: list of ticker to load data for, i.e. ``['AAPL', 'TSLA']`` OR ``None`` (loading for all possible tickers) Returns ------- ``pd.DataFrame`` base companies information ''' path = '{}/tickers.zip'.format(self.data_path) tickers_df = pd.read_csv(path) tickers_df = tickers_df[tickers_df['table'] == 'SF1'] if index is not None: tmp = pd.DataFrame() tmp['ticker'] = index tmp['flag'] = True tickers_df = pd.merge(tickers_df, tmp, on='ticker', how='left') tickers_df['flag'] = tickers_df['flag'].fillna(False) tickers_df = tickers_df[tickers_df['flag']] del tickers_df['flag'] return tickers_df.reset_index(drop=True)
[docs] def existing_index(self): ''' Returns ------- ``List`` existing index values that can pe pushed to `load` ''' path = '{}/tickers.zip'.format(self.data_path) tickers_df = pd.read_csv(path) tickers_df = tickers_df[tickers_df['table'] == 'SF1'] index = list(tickers_df['ticker'].unique()) return index
[docs]class SF1QuarterlyData: ''' Loader for quartely fundamental information about companies(debt, revenue etc) ''' def __init__(self, data_path: Optional[str]=None, quarter_count: Optional[int]=None, dimension: Optional[str]='ARQ'): ''' Parameters ---------- data_path: path to :mod:`~ml_investment.data_loaders.sf1` dataset folder If None, than will be used ``sf1_data_path`` from `~/.ml_investment/config.json` quarter_count: maximum number of last quarters to return. Resulted number may be less due to short history in some companies dimension: one of ``['MRY', 'MRT', 'MRQ', 'ARY', 'ART', 'ARQ']``. SF1 dataset-based parameter ''' if data_path is None: data_path = load_config()['sf1_data_path'] self.data_path = data_path self.quarter_count = quarter_count self.dimension = dimension
[docs] def load(self, index: List[str]) -> pd.DataFrame: ''' Parameters ---------- index: list of tickers to load data for, i.e. ``['AAPL', 'TSLA']`` Returns ------- ``pd.DataFrame`` quarterly information about companies ''' result = [] for ticker in index: path = '{}/core_fundamental/{}.json'.format(self.data_path, ticker) if not os.path.exists(path): continue df = _load_df(path) df = df[df['dimension'] == self.dimension] if self.quarter_count is not None: df = df[:self.quarter_count] df['date'] = df['datekey'] df = df.sort_values('date', ascending=False) #df = translate_currency(df) result.append(df) if len(result) == 0: return None result = pd.concat(result, axis=0).reset_index(drop=True) result = result.infer_objects() result['date'] = result['date'].astype(np.datetime64) return result
[docs] def existing_index(self): ''' Returns ------- ``List`` existing index values that can pe pushed to `load` ''' dir_path = '{}/core_fundamental'.format(self.data_path) index = [x.split('.json')[0] for x in os.listdir(dir_path)] return index
[docs]class SF1DailyData(): ''' Load daily information about company(marketcap, pe etc) ''' def __init__(self, data_path: Optional[str]=None, days_count: Optional[int]=None): ''' Parameters ---------- data_path: path to :mod:`~ml_investment.data_loaders.sf1` dataset folder If None, than will be used ``sf1_data_path`` from `~/.ml_investment/config.json` days_count: maximum number of last days to return. Resulted number may be less due to short history in some companies ''' if data_path is None: data_path = load_config()['sf1_data_path'] if days_count is None: days_count = int(1e5) self.data_path = data_path self.days_count = days_count
[docs] def load(self, index: List[str]) -> pd.DataFrame: ''' Parameters ---------- index: list of ticker to load data for, i.e. ``['AAPL', 'TSLA']`` Returns ------- ``pd.DataFrame`` daily information about companies ''' result = [] for ticker in index: path = '{}/daily/{}.json'.format(self.data_path, ticker) if not os.path.exists(path): continue daily_df = _load_df(path)[:self.days_count] result.append(daily_df) if len(result) == 0: return None result = pd.concat(result, axis=0).reset_index(drop=True) result = result.infer_objects() result['marketcap'] = result['marketcap'].astype(float) * 1e6 result['date'] = result['date'].astype(np.datetime64) return result
[docs] def existing_index(self): ''' Returns ------- ``List`` existing index values that can pe pushed to `load` ''' dir_path = '{}/daily'.format(self.data_path) index = [x.split('.json')[0] for x in os.listdir(dir_path)] return index
[docs]class SF1SNP500Data: ''' S&P500 historical constituents ''' def __init__(self, data_path: Optional[str]=None): ''' Parameters ---------- data_path: path to :mod:`~ml_investment.data_loaders.sf1` dataset folder If None, than will be used ``sf1_data_path`` from `~/.ml_investment/config.json` ''' if data_path is None: data_path = load_config()['sf1_data_path'] self.data_path = data_path
[docs] def load(self, index: Optional[List[np.datetime64]]=None) -> pd.DataFrame: ''' Parameters ---------- index: list of dates to load constituents for, i.e. ``[np.datetime64('2018-01-01'), np.datetime64('2018-05-10')]`` If there are no such date, than nearest past date will be used. OR ``None`` (loading for all dates when constituents was changed) Returns ------- ``pd.DataFrame`` constituents information ''' path = '{}/snp500.zip'.format(self.data_path) df = pd.read_csv(path) df['date'] = df['date'].astype(np.datetime64) df = df[df['action'] == 'historical'] if index is None: return df result = [] for date in index: tmp = df[(df['date'] <= date)] result.append(tmp[tmp['date'] == tmp['date'].max()]) if len(result) == 0: return result = pd.concat(result, axis=0) return result
[docs] def existing_index(self): ''' Returns ------- ``List`` existing index values that can pe pushed to `load` ''' path = '{}/snp500.zip'.format(self.data_path) df = pd.read_csv(path) df['date'] = df['date'].astype(np.datetime64) df = df[df['action'] == 'historical'] index = list(df['date'].unique()) return index