Source code for ml_investment.data_loaders.yahoo

'''
Loader for dataset provided by yahoo.
Data may be downloaded by script
:func:`~ml_investment.download_scripts.download_yahoo.main`

Expected dataset structure:
    | path to Yahoo data folder with structure
    | Yahoo
    | ├── quarterly
    | │   ├── AAPL.csv
    | │   ├── FB.csv
    | │   └── ...
    | ├── base
    |     ├── AAPL.json
    |     ├── FB.json
    |     └── ...
'''

import json
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from typing import Optional, Union, List
from ..utils import load_json




[docs]class YahooQuarterlyData: ''' Loader for quartely fundamental information about companies(debt, revenue etc) ''' def __init__(self, data_path: str, quarter_count: Optional[int]=None): ''' Parameters ---------- data_path: path to :mod:`~ml_investment.data_loaders.yahoo` dataset folder quarter_count: maximum number of last quarters to return. Resulted number may be less due to short history in some companies ''' self.data_path = data_path self.quarter_count = quarter_count
[docs] def load(self, index: List[str]) -> pd.DataFrame: ''' Parameters ---------- index: list of tickers to load data for Returns ------- ``pd.DataFrame`` quarterly information about companies ''' result = [] for ticker in index: path = '{}/quarterly/{}.csv'.format(self.data_path, ticker) if not os.path.exists(path): continue df = pd.read_csv(path) df['ticker'] = ticker if self.quarter_count is not None: df = df[:self.quarter_count] result.append(df) if len(result) > 0: result = pd.concat(result, axis=0).reset_index(drop=True) else: return None result['date'] = result['date'].astype(np.datetime64) result = result.drop_duplicates(['ticker', 'date']) result.index = range(len(result)) return result
[docs]class YahooBaseData: ''' Loader for base information about company(like sector, industry etc) ''' def __init__(self, data_path: str): ''' Parameters ---------- data_path: path to :mod:`~ml_investment.data_loaders.yahoo` dataset folder ''' self.data_path = data_path
[docs] def load(self, index: Optional[List[str]] = None) -> pd.DataFrame: ''' Parameters ---------- index: list of tickers to load data for OR ``None`` (for loading all possible tickers) Returns ------- ``pd.DataFrame`` base companies information ''' result = [] base_path = '{}/base'.format(self.data_path) if index is None: index = [x.split('.json')[0] for x in os.listdir(base_path)] for ticker in index: path = '{}/{}.json'.format(base_path, ticker) if not os.path.exists(path): continue data = load_json(path) data['ticker'] = ticker result.append(data) if len(result) == 0: return None result = pd.DataFrame(result) return result