import argparse
import os
import lightgbm as lgbm
import catboost as ctb
from typing import Optional
from urllib.request import urlretrieve
from ml_investment.utils import load_config
from ml_investment.features import QuarterlyFeatures, BaseCompanyFeatures, \
FeatureMerger, DailyAggQuarterFeatures
from ml_investment.targets import QuarterlyTarget
from ml_investment.models import GroupedOOFModel, EnsembleModel, LogExpModel
from ml_investment.metrics import median_absolute_relative_error
from ml_investment.pipelines import Pipeline
from ml_investment.download_scripts import download_sf1, download_commodities
config = load_config()
URL = 'https://github.com/fartuk/ml_investment/releases/download/weights/fair_marketcap_sf1.pickle'
OUT_NAME = 'fair_marketcap_sf1'
DATA_SOURCE='sf1'
CURRENCY = 'USD'
VERBOSE = True
MAX_BACK_QUARTER = 20
MIN_BACK_QUARTER = 0
BAGGING_FRACTION = 0.7
MODEL_CNT = 20
FOLD_CNT = 5
QUARTER_COUNTS = [2, 4, 10]
AGG_DAY_COUNTS = [100, 200, 400, 800]
SCALE_MARKETCAP = ["3 - Small", "4 - Mid", "5 - Large", "6 - Mega"]
DAILY_AGG_COLUMNS = ["marketcap", "pe"]
CAT_COLUMNS = ["sector", "sicindustry"]
QUARTER_COLUMNS = [
"revenue",
"netinc",
"ncf",
"assets",
"ebitda",
"debt",
"fcf",
"gp",
"workingcapital",
"cashneq",
"rnd",
"sgna",
"ncfx",
"divyield",
"currentratio",
"netinccmn",]
COMMODITIES_CODES = [
'LBMA/GOLD',
'JOHNMATT/PALL',]
def _check_download_data():
if not os.path.exists(config['sf1_data_path']):
print('Downloading sf1 data')
download_sf1.main()
if not os.path.exists(config['commodities_data_path']):
print('Downloading commodities data')
download_commodities.main()
def _create_data():
if DATA_SOURCE == 'sf1':
from ml_investment.data_loaders.quandl_commodities import QuandlCommoditiesData
from ml_investment.data_loaders.sf1 import SF1BaseData, SF1DailyData, \
SF1QuarterlyData
elif DATA_SOURCE == 'mongo':
from ml_investment.data_loaders.mongo import SF1BaseData, SF1DailyData, \
SF1QuarterlyData, QuandlCommoditiesData
data = {}
data['quarterly'] = SF1QuarterlyData()
data['base'] = SF1BaseData()
data['daily'] = SF1DailyData()
data['commodities'] = QuandlCommoditiesData()
return data
def _create_feature():
fc1 = QuarterlyFeatures(data_key='quarterly',
columns=QUARTER_COLUMNS,
quarter_counts=QUARTER_COUNTS,
max_back_quarter=MAX_BACK_QUARTER,
min_back_quarter=MIN_BACK_QUARTER,
verbose=VERBOSE)
fc2 = BaseCompanyFeatures(data_key='base',
cat_columns=CAT_COLUMNS,
verbose=VERBOSE)
# Daily agss on marketcap and pe is possible here because it
# normalized and there are no leakage.
fc3 = DailyAggQuarterFeatures(daily_data_key='daily',
quarterly_data_key='quarterly',
columns=DAILY_AGG_COLUMNS,
agg_day_counts=AGG_DAY_COUNTS,
max_back_quarter=MAX_BACK_QUARTER,
min_back_quarter=MIN_BACK_QUARTER,
verbose=VERBOSE)
fc4 = DailyAggQuarterFeatures(daily_data_key='commodities',
quarterly_data_key='quarterly',
columns=['price'],
agg_day_counts=AGG_DAY_COUNTS,
max_back_quarter=MAX_BACK_QUARTER,
min_back_quarter=MIN_BACK_QUARTER,
daily_index=COMMODITIES_CODES,
verbose=VERBOSE)
feature = FeatureMerger(fc1, fc2, on='ticker')
feature = FeatureMerger(feature, fc3, on=['ticker', 'date'])
feature = FeatureMerger(feature, fc4, on=['ticker', 'date'])
return feature
def _create_target():
target = QuarterlyTarget(data_key='quarterly',
col='marketcap',
quarter_shift=0)
return target
def _create_model():
base_models = [LogExpModel(lgbm.sklearn.LGBMRegressor()),
LogExpModel(ctb.CatBoostRegressor(verbose=False))]
ensemble = EnsembleModel(base_models=base_models,
bagging_fraction=BAGGING_FRACTION,
model_cnt=MODEL_CNT)
model = GroupedOOFModel(base_model=ensemble,
group_column='ticker',
fold_cnt=FOLD_CNT)
return model
[docs]def FairMarketcapSF1(max_back_quarter: int=None,
min_back_quarter: int=None,
data_source: Optional[str]=None,
pretrained: bool=True,
verbose: bool=None) -> Pipeline:
'''
Model is used to estimate fair company marketcap for several last quarters.
Pipeline uses features from
:class:`~ml_investment.features.BaseCompanyFeatures`,
:class:`~ml_investment.features.QuarterlyFeatures`,
:class:`~ml_investment.features.DailyAggQuarterFeatures`,
:class:`~ml_investment.features.CommoditiesAggQuarterFeatures`
and trained to predict real market capitalizations
( using :class:`~ml_investment.targets.QuarterlyTarget` ).
Since some companies are overvalued and some are undervalued,
the model makes an average "fair" prediction.
:mod:`~ml_investment.data_loaders.sf1` and
:mod:`~ml_investment.data_loaders.quandl_commodities`
is used for loading data.
Note:
SF1 dataset is paid, so for using this model you need to subscribe
and paste quandl token to `~/.ml_investment/secrets.json`
``quandl_api_key``
Parameters
----------
max_back_quarter:
max quarter number which will be used in model
min_back_quarter:
min quarter number which will be used in model
data_source:
which data use for model. One of ['sf1', 'mongo'].
If 'mongo', than data will be loaded from db,
credentials specified at `~/.ml_investment/config.json`.
If 'sf1' - from folder specified at ``sf1_data_path``
in `~/.ml_investment/secrets.json`.
pretrained:
use pretreined weights or not.
Downloading directory path can be changed in
`~/.ml_investment/config.json` ``models_path``
verbose:
show progress or not
'''
if data_source is not None:
global DATA_SOURCE
DATA_SOURCE = data_source
if max_back_quarter is not None:
global MAX_BACK_QUARTER
MAX_BACK_QUARTER = max_back_quarter
if min_back_quarter is not None:
global MIN_BACK_QUARTER
MIN_BACK_QUARTER = min_back_quarter
if verbose is not None:
global VERBOSE
VERBOSE = verbose
if DATA_SOURCE == 'sf1':
_check_download_data()
data = _create_data()
feature = _create_feature()
target = _create_target()
model = _create_model()
pipeline = Pipeline(feature=feature,
target=target,
model=model,
data=data,
out_name=OUT_NAME)
core_path = '{}/{}.pickle'.format(config['models_path'], OUT_NAME)
if pretrained:
if not os.path.exists(core_path):
urlretrieve(URL, core_path)
pipeline.load_core(core_path)
return pipeline
[docs]def main(data_source):
'''
Default model training. Resulted model weights directory path
can be changed in `~/.ml_investment/config.json` ``models_path``
'''
pipeline = FairMarketcapSF1(pretrained=False, data_source=data_source)
base_df = pipeline.data['base'].load()
tickers = base_df[(base_df['currency'] == CURRENCY) &\
(base_df['scalemarketcap'].apply(lambda x: x in SCALE_MARKETCAP))
]['ticker'].values
result = pipeline.fit(tickers, median_absolute_relative_error)
print(result)
path = '{}/{}'.format(config['models_path'], OUT_NAME)
pipeline.export_core(path)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
arg = parser.add_argument
arg('--data_source', type=str)
args = parser.parse_args()
main(args.data_source)