Source code for ml_investment.applications.fair_marketcap_sf1

import argparse
import os
import lightgbm as lgbm
import catboost as ctb

from typing import Optional
from urllib.request import urlretrieve
from ml_investment.utils import load_config
from ml_investment.features import QuarterlyFeatures, BaseCompanyFeatures, \
                                   FeatureMerger, DailyAggQuarterFeatures
from ml_investment.targets import QuarterlyTarget
from ml_investment.models import GroupedOOFModel, EnsembleModel, LogExpModel
from ml_investment.metrics import median_absolute_relative_error
from ml_investment.pipelines import Pipeline
from ml_investment.download_scripts import download_sf1, download_commodities

config = load_config()


URL = 'https://github.com/fartuk/ml_investment/releases/download/weights/fair_marketcap_sf1.pickle'
OUT_NAME = 'fair_marketcap_sf1'
DATA_SOURCE = 'sf1'
CURRENCY = 'USD'
VERBOSE = True
MAX_BACK_QUARTER = 20
MIN_BACK_QUARTER = 0
BAGGING_FRACTION = 0.7
MODEL_CNT = 20
FOLD_CNT = 5
QUARTER_COUNTS = [2, 4, 10]
AGG_DAY_COUNTS = [100, 200, 400, 800]
SCALE_MARKETCAP = ["3 - Small", "4 - Mid", "5 - Large", "6 - Mega"]
DAILY_AGG_COLUMNS = ["marketcap", "pe"]
CAT_COLUMNS = ["sector", "sicindustry"]
QUARTER_COLUMNS = [
            "revenue",
            "netinc",
            "ncf",
            "assets",
            "ebitda",
            "debt",
            "fcf",
            "gp",
            "workingcapital",
            "cashneq",
            "rnd",
            "sgna",
            "ncfx",
            "divyield",
            "currentratio",
            "netinccmn",]
COMMODITIES_CODES = [
            'LBMA/GOLD',
            'JOHNMATT/PALL',]
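# Hyperparameter summary (as used by the builders below): QUARTER_COUNTS are
# the rolling quarterly windows for QuarterlyFeatures, AGG_DAY_COUNTS the day
# windows for DailyAggQuarterFeatures, and MAX_BACK_QUARTER / MIN_BACK_QUARTER
# bound how many quarters back samples are generated for.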



def _check_download_data():
    if not os.path.exists(config['sf1_data_path']):
        print('Downloading sf1 data')
        download_sf1.main()
        
    if not os.path.exists(config['commodities_data_path']):
        print('Downloading commodities data')
        download_commodities.main()        



def _create_data():
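    # Both backends expose the same loader classes: 'sf1' reads the
    # downloaded flat files, 'mongo' reads the same data from a database.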
    if DATA_SOURCE == 'sf1':
        from ml_investment.data_loaders.quandl_commodities import QuandlCommoditiesData
        from ml_investment.data_loaders.sf1 import SF1BaseData, SF1DailyData, \
                                                   SF1QuarterlyData
    elif DATA_SOURCE == 'mongo':
        from ml_investment.data_loaders.mongo import SF1BaseData, SF1DailyData, \
                                    SF1QuarterlyData, QuandlCommoditiesData
    else:
        raise ValueError(
            "data_source should be one of ['sf1', 'mongo'], "
            "got '{}'".format(DATA_SOURCE))
    data = {}
    data['quarterly'] = SF1QuarterlyData()
    data['base'] = SF1BaseData()
    data['daily'] = SF1DailyData()
    data['commodities'] = QuandlCommoditiesData()
    
    return data


def _create_feature():
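    # Four feature calculators are merged on ticker (and date, where
    # applicable): rolling quarterly fundamentals, categorical company info,
    # daily marketcap/pe aggregates, and commodity price aggregates.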
    fc1 = QuarterlyFeatures(data_key='quarterly',
                            columns=QUARTER_COLUMNS,
                            quarter_counts=QUARTER_COUNTS,
                            max_back_quarter=MAX_BACK_QUARTER,
                            min_back_quarter=MIN_BACK_QUARTER,
                            verbose=VERBOSE)

    fc2 = BaseCompanyFeatures(data_key='base',
                              cat_columns=CAT_COLUMNS,
                              verbose=VERBOSE)

    # Daily aggregates on marketcap and pe are possible here because the
    # features are normalized and introduce no leakage.
    fc3 = DailyAggQuarterFeatures(daily_data_key='daily',
                                  quarterly_data_key='quarterly',
                                  columns=DAILY_AGG_COLUMNS,
                                  agg_day_counts=AGG_DAY_COUNTS,
                                  max_back_quarter=MAX_BACK_QUARTER,
                                  min_back_quarter=MIN_BACK_QUARTER,
                                  verbose=VERBOSE)

    fc4 = DailyAggQuarterFeatures(daily_data_key='commodities',
                                  quarterly_data_key='quarterly',
                                  columns=['price'],
                                  agg_day_counts=AGG_DAY_COUNTS,
                                  max_back_quarter=MAX_BACK_QUARTER,
                                  min_back_quarter=MIN_BACK_QUARTER,
                                  daily_index=COMMODITIES_CODES,
                                  verbose=VERBOSE)
    
    feature = FeatureMerger(fc1, fc2, on='ticker')
    feature = FeatureMerger(feature, fc3, on=['ticker', 'date'])
    feature = FeatureMerger(feature, fc4, on=['ticker', 'date'])

    return feature


def _create_target():
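    # The target is the company's marketcap in the same quarter as the
    # features (quarter_shift=0), i.e. a nowcast rather than a forecast.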
    target = QuarterlyTarget(data_key='quarterly',
                             col='marketcap',
                             quarter_shift=0)
    return target


def _create_model():
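    # LogExpModel fits each regressor on a log-scaled target and inverts the
    # transform at prediction time (marketcap spans orders of magnitude).
    # EnsembleModel bags MODEL_CNT such models on BAGGING_FRACTION subsamples,
    # and GroupedOOFModel keeps predictions out-of-fold with respect to
    # 'ticker', so no company is scored by a model that saw it in training.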
    base_models = [LogExpModel(lgbm.sklearn.LGBMRegressor()),
                   LogExpModel(ctb.CatBoostRegressor(verbose=False))]
                   
    ensemble = EnsembleModel(base_models=base_models, 
                             bagging_fraction=BAGGING_FRACTION,
                             model_cnt=MODEL_CNT)

    model = GroupedOOFModel(base_model=ensemble,
                            group_column='ticker',
                            fold_cnt=FOLD_CNT)
    
    return model



def FairMarketcapSF1(max_back_quarter: Optional[int]=None,
                     min_back_quarter: Optional[int]=None,
                     data_source: Optional[str]=None,
                     pretrained: bool=True,
                     verbose: Optional[bool]=None) -> Pipeline:
    '''
    Model for estimating fair company marketcap for the last several
    quarters. The pipeline uses features from
    :class:`~ml_investment.features.BaseCompanyFeatures`,
    :class:`~ml_investment.features.QuarterlyFeatures` and
    :class:`~ml_investment.features.DailyAggQuarterFeatures`
    (applied to both company daily data and commodity prices)
    and is trained to predict real market capitalizations
    (using :class:`~ml_investment.targets.QuarterlyTarget`).
    Since some companies are overvalued and some are undervalued,
    the model makes an average "fair" prediction.
    :mod:`~ml_investment.data_loaders.sf1` and
    :mod:`~ml_investment.data_loaders.quandl_commodities`
    are used for loading data.

    Note:
        The SF1 dataset is paid, so to use this model you need a
        subscription; paste your quandl token into
        `~/.ml_investment/secrets.json` as ``quandl_api_key``

    Parameters
    ----------
    max_back_quarter:
        maximum number of quarters back used by the model
    min_back_quarter:
        minimum number of quarters back used by the model
    data_source:
        which data source to use for the model. One of ['sf1', 'mongo'].
        If 'mongo', data will be loaded from the db whose credentials
        are specified in `~/.ml_investment/config.json`.
        If 'sf1', data is loaded from the folder specified at
        ``sf1_data_path`` in `~/.ml_investment/config.json`.
    pretrained:
        whether to use pretrained weights. The download directory
        can be changed via ``models_path`` in
        `~/.ml_investment/config.json`
    verbose:
        whether to show progress
    '''
    if data_source is not None:
        global DATA_SOURCE
        DATA_SOURCE = data_source

    if max_back_quarter is not None:
        global MAX_BACK_QUARTER
        MAX_BACK_QUARTER = max_back_quarter

    if min_back_quarter is not None:
        global MIN_BACK_QUARTER
        MIN_BACK_QUARTER = min_back_quarter

    if verbose is not None:
        global VERBOSE
        VERBOSE = verbose

    if DATA_SOURCE == 'sf1':
        _check_download_data()

    data = _create_data()
    feature = _create_feature()
    target = _create_target()
    model = _create_model()

    pipeline = Pipeline(feature=feature,
                        target=target,
                        model=model,
                        data=data,
                        out_name=OUT_NAME)

    core_path = '{}/{}.pickle'.format(config['models_path'], OUT_NAME)
    if pretrained:
        if not os.path.exists(core_path):
            urlretrieve(URL, core_path)
        pipeline.load_core(core_path)

    return pipeline

def main(data_source):
    '''
    Default model training. The resulting model weights directory
    can be changed via ``models_path`` in `~/.ml_investment/config.json`
    '''
    pipeline = FairMarketcapSF1(pretrained=False, data_source=data_source)

    base_df = pipeline.data['base'].load()
    tickers = base_df[
        (base_df['currency'] == CURRENCY) &
        (base_df['scalemarketcap'].apply(lambda x: x in SCALE_MARKETCAP))
    ]['ticker'].values

    result = pipeline.fit(tickers, median_absolute_relative_error)
    print(result)

    path = '{}/{}'.format(config['models_path'], OUT_NAME)
    pipeline.export_core(path)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    arg = parser.add_argument
    arg('--data_source', type=str)
    args = parser.parse_args()
    main(args.data_source)
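
# Minimal usage sketch (illustrative; assumes SF1 data is available,
# ``quandl_api_key`` is set in ~/.ml_investment/secrets.json, and that
# ``Pipeline.execute`` is the inference entry point, as elsewhere in
# ml_investment):
#
#     from ml_investment.applications.fair_marketcap_sf1 import FairMarketcapSF1
#
#     pipeline = FairMarketcapSF1(pretrained=True)
#     fair_df = pipeline.execute(['AAPL', 'MSFT'])  # fair marketcap per quarter
#     print(fair_df.head())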