# Source code for clairvoyant.history

"""History manages historical stock timeseries data.

This module provides a common interface for ``Clair`` so that she knows how
your data is formatted. This requires you to define a column map for your data,
which maps your column names to common names that Clair understands.
"""

from pytz import timezone
import pandas as pd
from copy import deepcopy


class History:
    """A wrapper for historical stock data.

    You can query for a row by date::

        history['2017-02-14 06:30:00']  # get data by a specific date

    You can slice using datetime objects or index numbers::

        history[startDate:endDate]  # get data between startDate and endDate
        history[0:100]              # get rows between 0 and 100
        history[:100]               # either bound may be omitted
        history[startDate:]

    You can get individual records by index::

        history[10]  # gets a row of data

    You can access a column of data by key just like a dataframe::

        history['Open']  # gets a column of data
        history.open     # or access the same data by attribute

    :param data: Client stock data. Can be a string representing a csv file or
                 it can be a pandas dataframe.
    :param col_map: A dict mapping your data's column names to common names
                    where the common names are keys and your custom names are
                    values. This is an optional parameter. If ``None`` is
                    provided, History will assume client data is already
                    formatted with common names. The dict is copied, so the
                    caller's object is never modified.
    :param tz: The timezone to associate with the datetime in data. Default
               (``None``) is UTC time.
    :param features: A list of column names that informs Clair which columns
                     can be used as learning features. Defaults to
                     ``['Sentiment', 'Influence']``.

    :raises KeyError: If a feature is not a key of the column map, or if a
                      column needed to compute the return rate is missing.

    :ivar date: Datetime series in data corresponding to the beginning of each
                period.
    :ivar open: Opening stock price series.
    :ivar high: Series of stock price highs.
    :ivar low: Series of stock price lows.
    :ivar close: Closing stock price series.
    :ivar volume: Series of stock price trading volume.
    :ivar return_rate: Series of percentage change calculated as a percent of
                       opening price.
    """

    def __init__(self, data, col_map=None, tz=None, features=None):
        if col_map is None:
            col_map = {
                'Date': 'Date', 'Open': 'Open', 'High': 'High', 'Low': 'Low',
                'Close': 'Close', 'Volume': 'Volume', 'Sentiment': 'Sentiment',
                'Influence': 'Influence'
                }
        # Work on a private copy: ``rename`` and the 'Return' entry below
        # rewrite the map, and that must not leak into the caller's dict.
        self._col_map = dict(col_map)

        if isinstance(data, str):
            self._df = self.read_csv(data)
        else:
            self._df = data

        # Rename the client's columns to the lowercased common names so rows
        # produced by ``itertuples`` expose predictable attribute names.
        newnames = {v: k.lower() for k, v in self._col_map.items()}
        self.rename(columns=newnames)

        if features is None:
            self.features = ['Sentiment', 'Influence']
        else:
            self.features = features

        # The UTC default is resolved here rather than in the signature so
        # that no call is evaluated at import time.
        self._timezone = timezone('UTC') if tz is None else tz
        self._df['Return'] = (self.close - self.open) / self.open
        self._col_map['Return'] = 'Return'

    @property
    def date(self):
        """Series of period start dates."""
        return self['Date']

    @property
    def open(self):
        """Series of opening prices."""
        return self['Open']

    @property
    def high(self):
        """Series of price highs."""
        return self['High']

    @property
    def low(self):
        """Series of price lows."""
        return self['Low']

    @property
    def close(self):
        """Series of closing prices."""
        return self['Close']

    @property
    def volume(self):
        """Series of trading volume."""
        return self['Volume']

    @property
    def return_rate(self):
        """Series of ``(close - open) / open`` computed at construction."""
        return self['Return']

    @property
    def features(self):
        """Common column names usable as learning features."""
        return self._features

    @features.setter
    def features(self, vals):
        # Features are validated against the common names (column-map keys),
        # not against the columns actually present in the dataframe.
        for v in vals:
            if v not in self._col_map:
                raise KeyError(f'\'{v}\' is not a valid column.')
        self._features = vals

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

    def __getitem__(self, key):
        """Look up rows or a column.

        :param key: One of

            * a ``slice`` of integers (positional rows) or of datetime-like
              bounds (inclusive date range); either bound may be ``None``.
              The slice step is ignored for date ranges.
            * an ``int`` (a single row by position),
            * a common column name (a key of the column map),
            * a date string (rows whose date column equals the string).

        :returns: A new, independent ``History`` for slices; a pandas Series
                  for an ``int`` or a column name; a DataFrame for a date
                  string.
        :raises KeyError: If ``key`` is neither a known column name nor
                          parseable as a date.
        """
        if isinstance(key, slice):
            return self._slice(key)
        if isinstance(key, int):
            return self._df.iloc[key]

        try:
            return self._df[self._col_map[key]]
        except KeyError as err:
            try:
                pd.to_datetime(key)  # test conversion to datetime
            except ValueError:
                raise KeyError(f'Invalid column map for {key}.') from err
            datekey = self._col_map['Date']
            return self._df.loc[self._df[datekey] == key]

    def _slice(self, key):
        """Return a new History restricted to the rows selected by ``key``."""
        bounds = (key.start, key.stop)
        if all(b is None or isinstance(b, int) for b in bounds):
            # Purely positional slice (covers open-ended ones like [:100]).
            frame = self._df.iloc[key]
        else:
            frame = self._df[self._date_mask(key.start, key.stop)]
        return self._with_frame(frame)

    def _date_mask(self, start, stop):
        """Return a boolean Series marking rows dated within [start, stop]."""
        dates = pd.to_datetime(self._df[self._col_map['Date']])
        bounds = [None if b is None else pd.Timestamp(b)
                  for b in (start, stop)]

        # Naive and tz-aware datetimes cannot be compared. Whichever side is
        # naive is localized to this History's timezone, and only when the
        # other side is aware, so naive-vs-naive comparisons are untouched.
        if dates.dt.tz is None:
            if any(b is not None and b.tzinfo is not None for b in bounds):
                dates = dates.dt.tz_localize(self._timezone)
        if dates.dt.tz is not None:
            bounds = [
                b.tz_localize(self._timezone)
                if b is not None and b.tzinfo is None else b
                for b in bounds
            ]

        lower, upper = bounds
        mask = pd.Series(True, index=dates.index)
        if lower is not None:
            mask &= dates >= lower
        if upper is not None:
            mask &= dates <= upper
        return mask

    def _with_frame(self, frame):
        """Return an independent copy of self that holds a copy of ``frame``."""
        cls = self.__class__
        clone = cls.__new__(cls)
        for name, value in self.__dict__.items():
            # Skip the full dataframe: only the selected rows get copied.
            if name != '_df':
                setattr(clone, name, deepcopy(value))
        clone._df = frame.copy()
        return clone

    def __iter__(self):
        """Iterate over rows as namedtuples (``DataFrame.itertuples``)."""
        return self._df.itertuples()

    def __len__(self):
        """Return the number of rows."""
        return len(self._df)

    def read_csv(self, *args, **kwargs):
        """Read a csv file.

        Exact same interface as ``pandas.read_csv``.
        """
        return pd.read_csv(*args, **kwargs)

    def rename(self, *args, **kwargs):
        """Rename the stored dataframe columns.

        Exposes the exact same interface as ``pandas.DataFrame.rename``.
        When the call renames columns (``columns=...`` or a positional mapper
        with ``axis=1``/``axis='columns'``), the internal column map is
        updated to follow the new names. Both dict-like and callable mappers
        are supported.
        """
        columns = kwargs.get('columns')
        if columns is None and args and kwargs.get('axis') in (1, 'columns'):
            columns = args[0]

        # Rename the dataframe first so the column map is left untouched if
        # pandas rejects the arguments.
        renamed = self._df.rename(*args, **kwargs)
        if renamed is not None:  # ``inplace=True`` returns None
            self._df = renamed

        if columns is None:
            return

        if callable(columns):
            translate = columns
        else:
            def translate(name):
                return columns[name] if name in columns else name

        self._col_map = {
            common: translate(current)
            for common, current in self._col_map.items()
        }