Professional Documents
Culture Documents
A.ipynb - Colaboratory
A.ipynb - Colaboratory
ipynb - Colaboratory
import os
# Kaggle API credentials for downloading the competition dataset.
# (Original comment translated from Turkish: to pull the dataset from Kaggle
# you need to enter your own Kaggle login details here.)
# SECURITY NOTE(review): a real-looking API key is hard-coded below; it is
# leaked by this notebook and should be revoked and loaded from a secret
# (e.g. kaggle.json or an environment variable) instead.
os.environ['KAGGLE_USERNAME'] = "mehmetbaaran" # username from the json file
os.environ['KAGGLE_KEY'] = "ed33689a8b7e311bf5b76e6d5f2e87ed" # key from the json file
# Notebook shell magics: download and extract the competition data files.
!kaggle competitions download -c optiver-realized-volatility-prediction
!unzip optiver-realized-volatility-prediction.zip
inflating: trade_train.parquet/stock_id 116/96cd7e2d2ea74a62bd8b6d739fce17ca.parque
inflating: trade_train.parquet/stock_id=118/d5c040f03683418f87556865166846df.parque
inflating: trade_train.parquet/stock_id=119/52f2e172b3444107903ec31242331923.parque
inflating: trade_train.parquet/stock_id=120/0d06cf1025d84e19be32c9d9ffed9fd2.parque
inflating: trade_train.parquet/stock_id=122/a2e2aa41640f4986bebc1b95bd29966f.parque
inflating: trade_train.parquet/stock_id=123/168edf076f064342ad8aa3b37551cb8a.parque
inflating: trade_train.parquet/stock_id=124/8e3d48a3b163471eb121216a37c6de62.parque
inflating: trade_train.parquet/stock_id=125/a3f2c4430e1f4a3fb5837af2d970825d.parque
inflating: trade_train.parquet/stock_id=126/3baa2ff424c8435d92fe32cfeed80ae7.parque
inflating: trade_train.parquet/stock_id=13/2aeef69f03154de5835b9ba611d16172.parquet
inflating: trade_train.parquet/stock_id=14/50621e92b84e47a197071584bea4bf1d.parquet
inflating: trade_train.parquet/stock_id=15/43c3ace09a0a45b889073ce646bc6832.parquet
inflating: trade_train.parquet/stock_id=16/09bbef3150f744b2979fcab0ebf9e001.parquet
inflating: trade_train.parquet/stock_id=17/687d7abe51a24e2084bcb95eb8174e9b.parquet
inflating: trade_train.parquet/stock_id=18/06bb948e2275475e92fd90735050657f.parquet
inflating: trade_train.parquet/stock_id=19/7876f5e5338c4521982a7001d4489727.parquet
inflating: trade_train.parquet/stock_id=2/ca5a4d1f67024204ac7fd496a0b46ba3.parquet
inflating: trade_train.parquet/stock_id=20/4c8b1e3b619a4ee496d68962d30c6da4.parquet
inflating: trade_train.parquet/stock_id=21/1d8dc18ebfee47ffbb54b04e6afc0634.parquet
inflating: trade_train.parquet/stock_id=22/dfb544bbc3c34211bf12a6dfe6b584af.parquet
inflating: trade_train.parquet/stock_id=23/dfe7edcd05564b07983496d1b8a3b3c1.parquet
inflating: trade_train.parquet/stock_id=26/16a87247231944c4a0be45f797d14eb8.parquet
inflating: trade_train.parquet/stock_id=27/5eac589708444020a196eb738f06ce08.parquet
inflating: trade_train.parquet/stock_id=28/ec6f88c0ae114ff482cac8fb6f78e87d.parquet
inflating: trade_train.parquet/stock_id=29/ccea57f8a6324f969fe7144950c7369b.parquet
inflating: trade_train.parquet/stock_id=3/e0843aaf024f49228b281081a2524b39.parquet
inflating: trade_train.parquet/stock_id=30/8d772f1265d64c06add811d2ae912c7e.parquet
inflating: trade_train.parquet/stock_id=31/26f4296741054f9da7e7848521a80526.parquet
inflating: trade_train.parquet/stock_id=32/af90f42692874cefaa07e585d739bc7b.parquet
inflating: trade_train.parquet/stock_id=33/917a91c91ce04f62939b8710660ebb3b.parquet
inflating: trade_train.parquet/stock_id=34/a55d7f7c8ff5406c9ba9e4b0f8af0ada.parquet
inflating: trade_train.parquet/stock_id=35/e2480196f2a3426ea84e9b4284414bc0.parquet
inflating: trade_train.parquet/stock_id=36/4f713fbe94f542579ec55561a22db006.parquet
inflating: trade_train.parquet/stock_id=37/941cf9679a80466893da714884f5c4b0.parquet
inflating: trade_train.parquet/stock_id=38/e3ed94a67a444dc1a04838588ffa443f.parquet
inflating: trade_train.parquet/stock_id=39/364c0ad490cb4b9cb493c952b2124f17.parquet
inflating: trade_train.parquet/stock_id=4/761268d671f9429abb29d9d2895e9bd2.parquet
inflating: trade_train.parquet/stock_id=40/61456b44107d4b24bfe5d4ad90baaafe.parquet
inflating: trade_train.parquet/stock_id=41/bbde7b09f5e743508840782fac034d10.parquet
inflating: trade_train.parquet/stock_id=42/caab969eb87b4f7abfdd18d606a22ed0.parquet
inflating: trade_train.parquet/stock_id=43/bb0efa57f511470e817880842e3e2afa.parquet
inflating: trade_train.parquet/stock_id=44/bdfeb97d57a149049aecb2250af2c82a.parquet
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 1/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
inflating: trade_train.parquet/stock_id=46/7c55f1b4f4a34f83981f05974369ff6f.parquet
inflating: trade_train.parquet/stock_id=47/79bac282f1da4cb4aa24f7c667740341.parquet
inflating: trade_train.parquet/stock_id=48/507fb85f3c2a484f9fa4a78bcbfa992b.parquet
inflating: trade_train.parquet/stock_id=5/a5fd25253e3f43db884fe1e6fc2a06c3.parquet
inflating: trade_train.parquet/stock_id=50/935a65e6f4fc4e5990d086d6f6b4a932.parquet
inflating: trade_train.parquet/stock_id=51/f868a356824b4825a3c804206f513f70.parquet
inflating: trade_train.parquet/stock_id=52/54a5eea04ed042759b75b0bca90d1ced.parquet
inflating: trade_train.parquet/stock_id=53/caf58d3fd60c4a699d78ef114cf9ee17.parquet
inflating: trade_train.parquet/stock_id=55/1f19ad38e0d54c9680dc34dadf4dce47.parquet
inflating: trade_train.parquet/stock_id=56/b8615a9991bd4eada46a0483f9f9e3e6.parquet
inflating: trade_train.parquet/stock_id=58/b0e92dc583a544a08308d8c4e1df52af.parquet
inflating: trade_train.parquet/stock_id=59/2653e96292d24c64b27c0b70de53b2f9.parquet
inflating: trade_train.parquet/stock_id=6/fcf85e8ee88944d7bcbc9fc9862ee3f1.parquet
inflating: trade_train.parquet/stock_id=60/ae35d2b5825d4f1c9b52f462862eae04.parquet
inflating: trade_train.parquet/stock_id=61/fab533c5a5a741d8929f391890a4d54d.parquet
inflating: trade_train.parquet/stock_id=62/8df3e07ae2b74ad19fd98d4b9acfcbf2.parquet
from tqdm.notebook import tqdm
import numpy as np
import pandas as pd
# Widen pandas display limits so wide DataFrames print fully in the notebook.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
1. Introduction
This competition's objective is predicting short-term volatility for 112 stocks across different
sectors. The dataset consists of book and trade data of the stocks for multiple time buckets. We are
supposed to predict a target value (volatility) for every time bucket of the stocks. 107 of the stocks
have 3830, 3 of the stocks have 3829, 1 of the stocks has 3820 and 1 of the stocks has 3815 time
buckets in training dataset. Thus, there are 428,932 rows to predict in it. Since this is a code
competition, public test set is hidden and private test set will be real market data collected in the
three-month evaluation period after competition ends. There are 3 columns in training set and 2
columns in placeholder test set.
# Column dtypes for train/test CSVs, downcast to reduce memory usage.
# NOTE: the closing brace of this dict was lost in the PDF export; restored.
train_test_dtypes = {
    'stock_id': np.uint8,
    'time_id': np.uint16,
    'target': np.float64
}
df_train = pd.read_csv('./train.csv', dtype=train_test_dtypes)
# test.csv is a placeholder; only stock_id/time_id are needed.
df_test = pd.read_csv('./test.csv', usecols=['stock_id', 'time_id'], dtype=train_test_dtypes)
print(f'Training Set Shape: {df_train.shape}')
print(f'Training Set Memory Usage: {df_train.memory_usage().sum() / 1024 ** 2:.2f} MB')
print(f'Test Set Shape: {df_test.shape}')
print(f'Test Set Memory Usage: {df_test.memory_usage().sum() / 1024 ** 2:.2f} MB')
2. Evaluation
Submissions are scored using RMSPE (root mean squared percentage error) which can be denoted
as
$$\mathrm{RMSPE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\left(\frac{y_i - \hat{y}_i}{y_i}\right)^2}$$
RMSPE is very similar to RMSE. The only difference between them is, the error is divided by the
actual value. Predictions closer to actual values yield errors closer to 0, so division by actual values
is sensitive to larger errors. In addition to that, errors are squared before they are averaged, which
makes this metric even more sensitive to larger errors. This means larger errors are not tolerable in
this domain.
One pitfall of RMSPE is it can raise ZeroDivisionError if a single data point in actual values is equal
to 0. Even though, there isn't any 0 target values in training set, this can be easily solved by adding a
small constant to actual values. A small constant epsilon wouldn't contribute to overall rmspe and
it prevents ZeroDivisionError.
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 3/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
For baselines, global target mean scores 1.11033, target mean of stock_ids scores 0.789618, and
target median of stock_ids scores 0.589135 on training set.
def root_mean_squared_percentage_error(y_true, y_pred, epsilon=1e-10):
    """Return the root mean squared percentage error between y_true and y_pred.

    epsilon is added to the actual values to guard against division by zero
    when an actual value is exactly 0; it is small enough not to affect the
    overall score otherwise.
    """
    percentage_errors = (y_true - y_pred) / (y_true + epsilon)
    return np.sqrt(np.mean(percentage_errors ** 2))
# Score the naive baselines with RMSPE: global target mean, per-stock target
# mean and per-stock target median.
# NOTE(review): the three assignment lines below are truncated at the page
# margin in this PDF export; code is kept exactly as printed.
target_mean_rmspe = root_mean_squared_percentage_error(df_train['target'], np.repeat(df_train
print(f'target Mean RMPSE: {target_mean_rmspe:.6}')
stock_id_target_mean_rmspe = root_mean_squared_percentage_error(df_train['target'], df_train.
print(f'stock_id target Mean RMPSE: {stock_id_target_mean_rmspe:.6}')
stock_id_target_median_rmspe = root_mean_squared_percentage_error(df_train['target'], df_trai
print(f'stock_id target Median RMPSE: {stock_id_target_median_rmspe:.6}')
Realized volatility, $\sigma$, is the square root of the sum of squared log returns, which is denoted as
$$\sigma = \sqrt{\sum_{t} r_{t-1,t}^{2}}, \qquad r_{t-1,t} = \log\left(\frac{S_t}{S_{t-1}}\right)$$
where $S_t$ is the price of the stock $S$ at time $t$. The price used for targets is the weighted averaged
price (WAP) and it can be derived from book data. Log returns are used for realized volatility
calculation because price differences are not always comparable across stocks.
def visualize_target(target):
    """Summarize and visualize a numeric column of df_train.

    Prints summary statistics and missing-value counts, then draws a KDE plot
    (with mean/median reference lines) next to a normal probability plot.

    NOTE(review): indentation and several line endings were lost in the PDF
    export; the statistic lists and axvline styles below are reconstructed
    from the visible prefixes — confirm against the original notebook.
    """
    print(f'{target}\n{"-" * len(target)}')
    print(f'Mean: {df_train[target].mean():.4f} - Median: {df_train[target].median():.4f} - Std: {df_train[target].std():.4f}')
    print(f'Min: {df_train[target].min():.4f} - 25%: {df_train[target].quantile(0.25):.4f} - 50%: {df_train[target].quantile(0.5):.4f} - 75%: {df_train[target].quantile(0.75):.4f} - Max: {df_train[target].max():.4f}')
    print(f'Skew: {df_train[target].skew():.4f} - Kurtosis: {df_train[target].kurtosis():.4f}')
    missing_values_count = df_train[df_train[target].isnull()].shape[0]
    training_samples_count = df_train.shape[0]
    print(f'Missing Values: {missing_values_count}/{training_samples_count} ({missing_values_count * 100 / training_samples_count:.4f}%)')

    fig, axes = plt.subplots(ncols=2, figsize=(24, 8), dpi=100)
    sns.kdeplot(df_train[target], label=target, fill=True, ax=axes[0])
    axes[0].axvline(df_train[target].mean(), label=f'{target} Mean', color='r', linewidth=2, linestyle='--')
    axes[0].axvline(df_train[target].median(), label=f'{target} Median', color='b', linewidth=2, linestyle='--')
    probplot(df_train[target], plot=axes[1])
    axes[0].legend(prop={'size': 16})
    for i in range(2):
        axes[i].tick_params(axis='x', labelsize=12.5, pad=10)
        axes[i].tick_params(axis='y', labelsize=12.5, pad=10)
        axes[i].set_xlabel('')
        axes[i].set_ylabel('')
    axes[0].set_title(f'{target} Distribution in Training Set', fontsize=20, pad=15)
    axes[1].set_title(f'{target} Probability Plot', fontsize=20, pad=15)
    plt.show()
Distribution of target is highly right-skewed and there are extreme outliers since some of the stocks
are very volatile. Those extreme outliers can be spotted on the probability plot.
visualize_target('target')
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 5/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
target
------
Min: 0.0001 - 25%: 0.0020 - 50%: 0.0030 - 75%: 0.0047 - Max: 0.0703
Each stock is very different in terms of volatility. Stocks are displayed below from most volatile to
least volatile. They are ranked by their mean target value across multiple time_ids. Wide error bars
show that even the realized volatility is volatile between multiple time buckets. One stock can be
extremely volatile in one time bucket and less volatile in another time bucket. This phenomenon
can be explained by time_ids not being sequential, and there aren't any temporal dependencies
between them.
# Rank stocks by their mean target (realized volatility) across time buckets
# and plot them with the per-stock std as error bars.
target_means = df_train.groupby('stock_id')['target'].mean()
target_stds = df_train.groupby('stock_id')['target'].std()
target_means_and_stds = pd.concat([target_means, target_stds], axis=1)
target_means_and_stds.columns = ['mean', 'std']
target_means_and_stds.sort_values(by='mean', ascending=True, inplace=True)

fig, ax = plt.subplots(figsize=(32, 48))
ax.barh(
    y=np.arange(len(target_means_and_stds)),
    width=target_means_and_stds['mean'],
    xerr=target_means_and_stds['std'],
    align='center',
    ecolor='black',
    capsize=3
)
ax.set_yticks(np.arange(len(target_means_and_stds)))
ax.set_yticklabels(target_means_and_stds.index)
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
# The export printed this call garbled ('ax.set title' plus a residue line
# across the page break); it is a single set_title call.
ax.set_title('Mean Realized Volatility of Stocks', size=25, pad=20)
plt.show()

# Free the intermediate aggregates.
del target_means, target_stds, target_means_and_stds
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 7/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 8/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
Instead of entire stocks, individual time buckets from different stocks are ranked based on their
realized volatility. The most volatile 10 time buckets can be seen below. The most volatile time
bucket belongs to stock 77 and its time_id is 24600. The most volatile stock was stock 18 and it
has 3 time buckets in this list.
# Top 10 most volatile individual time buckets across all stocks, labeled by
# a temporary "stock_id_time_id" key.
# NOTE(review): two lines were cut at the page margin in the export; the
# trailing .astype(str) and closing parenthesis are reconstructed.
df_train['stock_time_id'] = df_train['stock_id'].astype(str) + '_' + df_train['time_id'].astype(str)
fig, ax = plt.subplots(figsize=(32, 10))
ax.barh(
    y=np.arange(10),
    width=df_train.sort_values(by='target', ascending=True).tail(10)['target'],
    align='center',
    ecolor='black',
)
ax.set_yticks(np.arange(10))
ax.set_yticklabels(df_train.sort_values(by='target', ascending=True).tail(10)['stock_time_id'])
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_time_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Top 10 Most Volatile Time Buckets', size=25, pad=20)
plt.show()
# Drop the helper column so df_train keeps its original schema.
df_train.drop(columns=['stock_time_id'], inplace=True)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 9/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
The least volatile 10 time buckets are also visualized and they can be seen below. All of the least
volatile 10 time buckets belong to stock 31, even though it has an average volatility overall. This
could be an anomaly and it must be explored further.
# Bottom 10: the least volatile time buckets in the training set (all from
# stock 31), labeled by a temporary "stock_id_time_id" key.
# NOTE(review): truncated line endings reconstructed as in the previous cell.
df_train['stock_time_id'] = df_train['stock_id'].astype(str) + '_' + df_train['time_id'].astype(str)
fig, ax = plt.subplots(figsize=(32, 10))
ax.barh(
    y=np.arange(10),
    width=df_train.sort_values(by='target', ascending=True).head(10)['target'],
    align='center',
    ecolor='black',
)
ax.set_yticks(np.arange(10))
ax.set_yticklabels(df_train.sort_values(by='target', ascending=True).head(10)['stock_time_id'])
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_time_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
# The export shows this call twice, garbled ('ax.set title' / missing quotes)
# across a page break; it is a single set_title call.
ax.set_title('Top 10 Least Volatile Time Buckets', size=25, pad=20)
plt.show()
# Drop the helper column so df_train keeps its original schema.
df_train.drop(columns=['stock_time_id'], inplace=True)
4. Order Book
Order book is a list of buy and sell orders organized by price level for given stocks. An order book
lists the number of shares being bid on or asked at each price point. Order books help to improve
market transparency as they provide information on price, availability and depth of trade.
Order book data files are named as book_train.parquet and book_test.parquet , and they are
parquet files partitioned by stock_id column. Partitioned by stock_id means those files can be
read either as a whole or as individual stocks. However, reading them as a single file is not easy as they will
consume too much memory.
There are 10 columns in every book data partition. The columns are:
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 11/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
Values in order book are the last snapshots of each second. Some of the seconds are not available
because there weren't any related market activities during those seconds, thus order books are not
updated. Normally, given order books last 600 seconds for every time bucket. Values of the
missing seconds are the values at the last updated second, so the book data can be reindexed to
600 seconds for every time bucket and missing values can be forward filled for every field. Forward
filling and sorting order books by time_id and seconds_in_bucket doesn't change the extracted
features but these functionalities can be added for private test set as a sanity check.
def read_book_data(dataset, stock_id, sort=False, forward_fill=False):
    """Read one stock's order-book parquet partition.

    Parameters
    ----------
    dataset : str
        'train' or 'test'; selects book_{dataset}.parquet.
    stock_id : int
        Partition to read (stock_id={stock_id} directory).
    sort : bool
        Sort rows by time_id and seconds_in_bucket.
    forward_fill : bool
        Reindex every time bucket to the full 0-599 second range and forward
        fill: book values persist until the next update, so missing seconds
        take the values of the last updated second.

    Returns
    -------
    pd.DataFrame with columns downcast to compact dtypes.
    """
    book_dtypes = {
        'time_id': np.uint16,
        'seconds_in_bucket': np.uint16,
        'bid_price1': np.float32,
        'ask_price1': np.float32,
        'bid_price2': np.float32,
        'ask_price2': np.float32,
        'bid_size1': np.uint32,
        'ask_size1': np.uint32,
        'bid_size2': np.uint32,
        'ask_size2': np.uint32,
    }
    df_book = pd.read_parquet(f'./book_{dataset}.parquet/stock_id={stock_id}')
    # Downcast every column to save memory.
    for column, dtype in book_dtypes.items():
        df_book[column] = df_book[column].astype(dtype)
    if sort:
        df_book.sort_values(by=['time_id', 'seconds_in_bucket'], inplace=True)
    if forward_fill:
        df_book = df_book.set_index(['time_id', 'seconds_in_bucket'])
        # NOTE(review): this reindex call was truncated in the PDF export;
        # reconstructed as a full (time_id x 600 seconds) grid with forward
        # fill — confirm against the original notebook.
        df_book = df_book.reindex(
            pd.MultiIndex.from_product(
                [df_book.index.levels[0], np.arange(0, 600)],
                names=['time_id', 'seconds_in_bucket']
            ),
            method='ffill'
        )
        df_book.reset_index(inplace=True)
    return df_book
As mentioned, realized volatilities are calculated from the weighted averaged price in order books
of every stock. The formula for weighted averaged price is
Realized volatilities are calculated using the most competitive buy and sell levels, but the same
formula can be applied to the second most competitive buy and sell levels or other prices/sizes as well.
After that, log returns of the weighted averaged price are summed and square rooted for realized
volatility calculation.
# For every stock, compute the realized volatility of the current 10-minute
# window from WAP1 and WAP2 (log returns per time bucket, squared, summed,
# square-rooted) and attach the results to df_train.
# NOTE(review): indentation and most line endings were lost in this PDF
# export (the WAP formulas, groupby calls and assignments are cut at the
# page margin); code is kept exactly as printed.
for stock_id in tqdm(sorted(df_train['stock_id'].unique())):
df_book = read_book_data('train', stock_id)
# Weighted averaged prices
df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] *
(df_book['bid_size1'] + df_book['ask_size1'])
df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] *
(df_book['bid_size2'] + df_book['ask_size2'])
# Realized volatilities
for wap in [1, 2]:
df_book[f'log_return_from_wap{wap}'] = np.log(df_book[f'wap{wap}'] / df_book.groupby(
df_book[f'squared_log_return_from_wap{wap}'] = df_book[f'log_return_from_wap{wap}'] *
df_book[f'realized_volatility_from_wap{wap}'] = np.sqrt(df_book.groupby('time_id')[f'
df_book.drop(columns=[f'squared_log_return_from_wap{wap}'], inplace=True)
realized_volatilities = df_book.groupby('time_id')[f'realized_volatility_from_wap{wap
df_train.loc[df_train['stock_id'] == stock_id, f'realized_volatility_from_wap{wap}']
Realized volatilities calculated in the cell above belong to the current 10 minute window. They can be
used as baseline predictions. Realized volatilities calculated from WAP1 scores 0.341354 and
realized volatilities calculated from WAP2 scores 0.705453 on training set. They should be used as
predictors in models since realized volatilities of the current 10 minute window is very valuable
information. Targets are most likely be closer to those values.
# Score the current-window realized volatilities (from WAP1 and WAP2) as
# baseline predictions with RMSPE.
# NOTE(review): both assignment lines are truncated at the page margin in
# this PDF export; code kept exactly as printed.
realized_volatility_wap1_rmspe = root_mean_squared_percentage_error(df_train['target'], df_tr
print(f'Realized Volatility from WAP1 RMPSE: {realized_volatility_wap1_rmspe:.6}')
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 13/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
realized_volatility_wap2_rmspe = root_mean_squared_percentage_error(df_train['target'], df_tr
print(f'Realized Volatility from WAP2 RMPSE: {realized_volatility_wap2_rmspe:.6}')
The order book helps traders to make more informed trading decisions by showing order
imbalances that may provide clues to a stock’s direction in the very short term. A huge imbalance
of buy orders against sell orders may indicate a move higher in the stock due to buying pressure, or
vice versa. Traders can also use the order book to help pinpoint a stock’s potential support and
resistance levels. A cluster of large buy orders at a specific price may indicate a level of support,
while an abundance of sell orders at or near one price may suggest an area of resistance.
Realized
volatilities increase when those moves in either directions become more frequent. The function in
the cell below is going to be used for visualizing individual order book time buckets and trying to
find clues about their volatilities.
def visualize_book_time_bucket(stock_id, time_id):
# Plot one order-book time bucket: prices (bid/ask levels 1-2 plus WAP1/WAP2)
# on the first axis and the corresponding sizes on the second, with the
# current realized volatility and the target shown in the titles.
# NOTE(review): indentation and many line endings were lost in this PDF
# export (plot calls, WAP formulas and title f-strings are cut at the page
# margin); code is kept exactly as printed.
time_bucket = (df_train['stock_id'] == stock_id) & (df_train['time_id'] == time_id)
target = df_train.loc[time_bucket, 'target'].iloc[0]
realized_volatility = df_train.loc[time_bucket, 'realized_volatility_from_wap1'].iloc[0]
df_book = read_book_data('train', stock_id, sort=True, forward_fill=True)
df_book = df_book.set_index('seconds_in_bucket')
df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] *
(df_book['bid_size1'] + df_book['ask_size1'])
df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] *
(df_book['bid_size2'] + df_book['ask_size2'])
fig, axes = plt.subplots(figsize=(32, 30), nrows=2)
axes[0].plot(df_book.loc[df_book['time_id'] == time_id, 'bid_price1'], label='bid_price1'
axes[0].plot(df_book.loc[df_book['time_id'] == time_id, 'ask_price1'], label='ask_price1'
axes[0].plot(df_book.loc[df_book['time_id'] == time_id, 'bid_price2'], label='bid_price2'
axes[0].plot(df_book.loc[df_book['time_id'] == time_id, 'ask_price2'], label='ask_price2'
axes[0].plot(df_book.loc[df_book['time_id'] == time_id, 'wap1'], label='wap1', lw=2, line
axes[0].plot(df_book.loc[df_book['time_id'] == time_id, 'wap2'], label='wap2', alpha=0.3,
axes[1].plot(df_book.loc[df_book['time_id'] == time_id, 'bid_size1'], label='bid_size1',
axes[1].plot(df_book.loc[df_book['time_id'] == time_id, 'ask_size1'], label='ask_size1',
axes[1].plot(df_book.loc[df_book['time_id'] == time_id, 'bid_size2'], label='bid_size2',
axes[1].plot(df_book.loc[df_book['time_id'] == time_id, 'ask_size2'], label='ask_size2',
for i in range(2):
axes[i].legend(prop={'size': 18})
axes[i].tick_params(axis='x', labelsize=20, pad=10)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 14/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
axes[i].tick_params(axis='y', labelsize=20, pad=10)
axes[0].set_ylabel('price', size=20, labelpad=15)
axes[1].set_ylabel('size', size=20, labelpad=15)
axes[0].set_title(
f'Prices of stock_id {stock_id} time_id {time_id} - Current Realized Volatility: {rea
size=25,
pad=15
)
axes[1].set_title(
f'Sizes of stock_id {stock_id} time_id {time_id} - Current Realized Volatility: {real
size=25,
pad=15
)
plt.show()
The most volatile time bucket in next 10 minute window is time 24600 from stock 77. Current
realized volatility of that time bucket is 0.02 which is roughly greater than 99.7% of other current
realized volatilities. The time bucket was extremely volatile in current 10 minute window and it
became the most volatile time bucket in next 10 minute window. Short-term realized volatilities are
definitely correlated in this case. First of all, this time bucket is on a decreasing trend because large
amount of most competitive ask sizes are always dominating most competitive bid sizes. Some
traders were trying to sell lots of shares at most competitive price very frequently. In addition to
that, this time bucket has more updates compared to others. Update frequency of time buckets
might be correlated to realized volatility since more squared log return values greater than 0 will be
summed.
# The most volatile time bucket of the next window (stock 77, time_id 24600).
visualize_book_time_bucket(stock_id=77, time_id=24600)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 15/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 16/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
The most volatile time bucket in current 10 minute window is time 30128 from stock 30. Realized
volatility of the next 10 minute window for this time bucket is 0.039 which is roughly greater than
99.994% of other target values. In this case, this time bucket became less volatile in next 10 minute
window and that suggests it is really hard to tell whether the realized volatility will be more or less
in short-term, but it is expected to be closer to current realized volatility. This stock was extremely
volatile in this time bucket because large amount of sell orders at most competitive price
dominated buy orders in first quarter and the opposite happened in second quarter. Later, buy/sell
order domination kept switching places in smaller scales and that caused smaller spikes in
weighted averaged price 1.
# The most volatile time bucket of the current window (stock 30, time_id 30128).
visualize_book_time_bucket(stock_id=30, time_id=30128)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 17/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
The least volatile time bucket in next 10 minute window is time 8534 from stock 31. The sizes of
most competitive bid and ask orders were always balanced until the very end. Couple changes at
the end caused a small decrease and increase in weighted average price 1. That was the only
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 18/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
significant contribution to current realized volatility. This time bucket became almost 20 times less
volatile in next 10 minute window.
# The least volatile time bucket of the next window (stock 31, time_id 8534).
visualize_book_time_bucket(stock_id=31, time_id=8534)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 19/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
The least volatile time bucket in current 10 minute window is time 28959 from stock 31. Prices and
sizes of most competitive bid and ask orders are extremely balanced in this entire time bucket.
Sizes of most competitive bid level is shifted to a new state at the second half. That movement
slightly increased the weighted averaged price 1, and that was the only thing contributed to current
realized volatility. Stock 31 is very calm compared to other stocks. Sizes of most competitive bid
and ask levels could be the reason of this phenomenon, or it could be an anomaly.
# The least volatile time bucket of the current window (stock 31, time_id 28959).
visualize_book_time_bucket(stock_id=31, time_id=28959)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 20/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 21/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
To wrap up, order book is very important as it gives information about buy and sell orders at most
competitive levels. Target is also derived from order books of the stocks, so features extracted
from them will be very useful. Features that are capturing the price/size imbalances might be
useful for models to predict realized volatilities of next 10 minute window.
5. Trade
Trade data represents the aggregation of all individual executed orders for corresponding stocks
and time buckets. Size is the sum of the size in each individual order, price is aggregated as a
weighted averaged price of all trades and order count is the number of unique trade orders taking
place. For trade data, missing seconds_in_bucket implies no trade happening within that one
second window.
Trade data files are named as trade_train.parquet and trade_test.parquet , and they are
parquet files partitioned by stock_id column. Partitioned by stock_id means those files can be
read either as a whole or as individual stocks. However, reading them as a single file is not easy as they will
consume too much memory.
There are 5 columns in every trade data partition. The columns are:
Trade data has lots of missing seconds_in_bucket as it is more sparse than order book. Besides,
there are some missing time buckets in trade data which means there weren't any trades in that 10
minute window. In addition to that, seconds_in_bucket doesn't necessarily start from 0 in trade
data. Trade data can be still reshaped into a fixed size by adding missing time buckets and
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 22/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
reindexing time steps to 600 seconds. Missing values can be filled with zeros afterwards since
there weren't any trades at those one second windows.
def read_trade_data(df, dataset, stock_id, sort=False, zero_fill=False):
    """Read one stock's trade parquet partition.

    Parameters
    ----------
    df : pd.DataFrame
        Reference frame (df_train/df_test) used to find time buckets that are
        entirely missing from the trade partition (no trades happened).
    dataset : str
        'train' or 'test'; selects trade_{dataset}.parquet.
    stock_id : int
        Partition to read.
    sort : bool
        Sort rows by time_id and seconds_in_bucket.
    zero_fill : bool
        Add missing time buckets, reindex every bucket to the full 0-599
        second range and fill gaps with zeros (no trade in that second).

    Returns
    -------
    pd.DataFrame with columns downcast to compact dtypes.
    """
    trade_dtypes = {
        'time_id': np.uint16,
        'seconds_in_bucket': np.uint16,
        'price': np.float32,
        'size': np.uint16,
        'order_count': np.uint16
    }
    df_trade = pd.read_parquet(f'./trade_{dataset}.parquet/stock_id={stock_id}')
    if zero_fill:
        # Time buckets present in df but absent here had no trades at all;
        # merge them in so the reindex below also covers them.
        # NOTE(review): the two lines below were truncated in the PDF export;
        # the trailing reset_index/isin closings are reconstructed — confirm.
        stock_time_buckets = df.loc[df['stock_id'] == stock_id, 'time_id'].reset_index(drop=True)
        missing_time_buckets = stock_time_buckets[~stock_time_buckets.isin(df_trade['time_id'])].reset_index(drop=True)
        df_trade = df_trade.merge(missing_time_buckets, how='outer')
    if sort:
        df_trade.sort_values(by=['time_id', 'seconds_in_bucket'], inplace=True)
    if zero_fill:
        df_trade = df_trade.set_index(['time_id', 'seconds_in_bucket'])
        df_trade = df_trade.reindex(
            pd.MultiIndex.from_product(
                [df_trade.index.levels[0], np.arange(0, 600)],
                names=['time_id', 'seconds_in_bucket']
            )
        )
        # A missing second means no trade happened in that window -> zeros.
        df_trade.fillna(0, inplace=True)
        df_trade.reset_index(inplace=True)
    # Downcast every column to save memory.
    for column, dtype in trade_dtypes.items():
        df_trade[column] = df_trade[column].astype(dtype)
    return df_trade
Previously, realized volatilities were derived from weighted averaged price of the order book, but
they can be derived from trade price as well since it is the weighted averaged price of the trade
orders taking place. Realized volatilities derived from trade price are different from the previous
ones because trade data is more sparse.
# For every stock, compute the realized volatility of the current window from
# the trade price and attach it to df_train.
# NOTE(review): several lines below are truncated at the page margin in this
# PDF export; code kept exactly as printed. Also, the `for wap in [1, 2]:`
# loop never uses `wap`, so its body runs twice identically — it looks like a
# copy-paste remnant from the order-book cell and could be removed.
for stock_id in tqdm(sorted(df_train['stock_id'].unique())):
df_trade = read_trade_data(df_train, 'train', stock_id)
# Realized volatility
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 23/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
for wap in [1, 2]:
df_trade['log_return_from_price'] = np.log(df_trade['price'] / df_trade.groupby('time
df_trade['squared_log_return_from_price'] = df_trade['log_return_from_price'] ** 2
df_trade['realized_volatility_from_price'] = np.sqrt(df_trade.groupby('time_id')['squ
df_trade.drop(columns=['squared_log_return_from_price'], inplace=True)
realized_volatilities = df_trade.groupby('time_id')['realized_volatility_from_price']
df_train.loc[df_train['stock_id'] == stock_id, 'realized_volatility_from_price'] = df
Realized volatilities derived from trade price are used as baseline predictions. They score 0.380267
on training set. They can be used as predictors just like the realized volatilities derived from the
order book.
# Score the trade-price realized volatility baseline with RMSPE.
# NOTE(review): the assignment line is truncated at the page margin in this
# PDF export; code kept exactly as printed.
realized_volatility_price_rmspe = root_mean_squared_percentage_error(df_train['target'], df_t
print(f'Realized Volatility from price RMPSE: {realized_volatility_price_rmspe:.6}')
Trade data tells how many shares are traded at what weighted averaged price by how many unique
traders for every time step. Those traded shares are the ones listed in the order book at the
corresponding time step. Oscillations of trade price, size and order count might be an indicator of
realized volatility of the next 10 minute window. Another important thing to consider is sparsity.
Sparsity indicates that the stock is calm and there aren't any trades taking place which could be a
strong indicator of low realized volatility. However, it may not be a predictor of the next 10 minute
windows' realized volatility. The function in the cell below is going to be used for visualizing
individual trade data time buckets and trying to find clues about their volatilities. It is important to
visualize every sequence with and without zeros in order to understand both oscillations and
sparsity.
def visualize_trade_time_bucket(stock_id, time_id):
    """Plot price, size and order_count of one (stock_id, time_id) trade bucket,
    each both with and without zero-filled gaps, with the bucket's target and
    current realized volatility shown in the panel titles.

    NOTE(review): the indentation and several line endings below were mangled
    by the notebook-to-PDF export (some lines end mid-expression, and Colab
    URL/timestamp artifacts are interleaved); restore from the notebook.
    """
time_bucket = (df_train['stock_id'] == stock_id) & (df_train['time_id'] == time_id)
target = df_train.loc[time_bucket, 'target'].iloc[0]
realized_volatility = df_train.loc[time_bucket, 'realized_volatility_from_wap1'].iloc[0]
# zero_fill=True inserts the empty seconds so sparsity is visible in the plots.
df_trade = read_trade_data(df_train, 'train', stock_id, sort=True, zero_fill=True)
df_trade = df_trade.set_index('seconds_in_bucket')
# Six stacked panels: price / size / order_count, each without and with zeros.
fig, axes = plt.subplots(figsize=(32, 70), nrows=6)
axes[0].plot(df_trade.loc[(df_trade['time_id'] == time_id) & (df_trade['price'] != 0), 'p
axes[1].plot(df_trade.loc[(df_trade['time_id'] == time_id), 'price'], label='price with z
axes[2].plot(df_trade.loc[(df_trade['time_id'] == time_id) & (df_trade['price'] != 0), 's
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 24/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
axes[3].plot(df_trade.loc[(df_trade['time_id'] == time_id), 'size'], label='size with zer
axes[4].plot(df_trade.loc[(df_trade['time_id'] == time_id) & (df_trade['price'] != 0), 'o
axes[5].plot(df_trade.loc[(df_trade['time_id'] == time_id), 'order_count'], label='order_
# Shared cosmetics for all six panels.
for i in range(6):
axes[i].legend(prop={'size': 18})
axes[i].tick_params(axis='x', labelsize=20, pad=10)
axes[i].tick_params(axis='y', labelsize=20, pad=10)
axes[0].set_ylabel('price', size=20, labelpad=15)
axes[1].set_ylabel('price', size=20, labelpad=15)
axes[2].set_ylabel('size', size=20, labelpad=15)
axes[3].set_ylabel('size', size=20, labelpad=15)
axes[4].set_ylabel('order_count', size=20, labelpad=15)
axes[5].set_ylabel('order_count', size=20, labelpad=15)
axes[0].set_title(
    f'Price of stock_id {stock_id} time_id {time_id} without zeros - Current Realized Vol
size=25,
pad=15
)
axes[1].set_title(
    f'Price of stock_id {stock_id} time_id {time_id} with zeros - Current Realized Volati
size=25,
pad=15
)
axes[2].set_title(
    f'Size of stock_id {stock_id} time_id {time_id} without zeros - Current Realized Vola
size=25,
pad=15
)
axes[3].set_title(
    f'Size of stock_id {stock_id} time_id {time_id} with zeros - Current Realized Volatil
size=25,
pad=15
)
axes[4].set_title(
    f'Order count of stock_id {stock_id} time_id {time_id} without zeros - Current Realiz
size=25,
pad=15
)
axes[5].set_title(
    f'Order count of stock_id {stock_id} time_id {time_id} with zeros - Current Realized
size=25,
pad=15
)
plt.show()
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 25/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
As seen before, the most volatile time bucket in the next 10-minute window is time 24600 from stock
77. The current realized volatility of that time bucket is 0.02, which is roughly greater than 99.7% of
the other current realized volatilities. Trade price without zeros is almost perfectly correlated with
the weighted average price from the order book, and trade price with zeros shows that trades take
place very frequently. Size shows that some extreme amounts of shares are traded at certain times
by one or multiple traders. Finally, order count shows that there are large numbers of unique trade
orders from time to time, regardless of the total size and price.
# Revisit the most volatile bucket identified earlier: stock 77, time 24600.
most_volatile_stock, most_volatile_time = 77, 24600
visualize_trade_time_bucket(stock_id=most_volatile_stock, time_id=most_volatile_time)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 26/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 27/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 28/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
As seen before, the least volatile time bucket in the next 10-minute window is time 8534 from stock 31.
Only 4 trades took place in this time bucket, which is why both its current realized volatility and
its next-10-minute-window realized volatility are very low. Regardless of the values, sparsity is the
obvious reason in this case.
# Revisit the least volatile bucket identified earlier: stock 31, time 8534.
least_volatile_stock, least_volatile_time = 31, 8534
visualize_trade_time_bucket(stock_id=least_volatile_stock, time_id=least_volatile_time)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 29/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 30/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 31/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
Some time buckets with zero trades exist in the trade data. Time 62 from stock 37 is an
example of such a time bucket. Those outliers should be handled differently from other stocks
since they have empty trade data.
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 32/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
# Example of a time bucket with no trades at all: stock 37, time 62.
empty_stock, empty_time = 37, 62
visualize_trade_time_bucket(stock_id=empty_stock, time_id=empty_time)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 33/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 34/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 35/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
6. Feature Engineering
Feature engineering is very important in this competition because it won't be easy to use neural
networks directly on time buckets. Previously, realized volatilities of current 10 minute window
were extracted from WAP1 and WAP2, but the same aggregation can be applied to any sequence
since it tells the total amount of movement happened in it. Other simple aggregations like mean,
standard deviation, min and max can also be used for feature engineering.
A very simple feature, bid_ask_price_ratio, is created while iterating over order books. Multiple
aggregations on that simple feature might yield valuable information about the realized volatility of the
next 10-minute window. Aggregations on simple features like that, or aggregations on raw
sequences, are the most straightforward way of feature engineering. Before moving forward to
advanced features, simple feature aggregations should be fully explored.
def log_return(x):
    """Return the per-step log returns of a price series.

    Equivalent to log(p_t) - log(p_{t-1}); the first element is NaN because
    it has no predecessor.
    """
    logged_prices = np.log(x)
    return logged_prices.diff()


def realized_volatility(x):
    """Return the realized volatility of a price series.

    Defined as the square root of the sum of squared log returns (the leading
    NaN from the diff is skipped by the pandas sum).
    """
    squared_returns = log_return(x) ** 2
    return np.sqrt(np.sum(squared_returns))
# Per-stock simple feature engineering from the order book: bid/ask price
# ratio aggregations and weighted-average-price (WAP) aggregations, written
# back onto the matching rows of df_train.
# NOTE(review): several lines below end mid-expression — truncated by the
# notebook-to-PDF export — and the embedded Colab URL/timestamp lines are
# export artifacts, not code; restore from the original notebook.
for stock_id in tqdm(sorted(df_train['stock_id'].unique())):
df_book = read_book_data('train', stock_id)
# Bid/Ask ratio aggregations
df_book['bid_ask_price_ratio'] = df_book['bid_price1'] / df_book['ask_price1']
for agg in ['mean', 'std', 'min', 'max', realized_volatility]:
bid_ask_price_ratio_aggregation = df_book.groupby('time_id')['bid_ask_price_ratio'].a
# String aggregators name the feature directly; the callable
# (realized_volatility) contributes its __name__.
feature_name = agg.__name__ if callable(agg) else agg
df_train.loc[df_train['stock_id'] == stock_id, f'book_bid_ask_price_ratio_{feature_na
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 36/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
# Weighted averaged prices
# The bid price is weighted by the ask size (and, presumably, vice versa —
# the right-hand side is truncated; confirm against the notebook).
df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] + df_book['ask_price1'] *
(df_book['bid_size1'] + df_book['ask_size1'])
df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] + df_book['ask_price2'] *
(df_book['bid_size2'] + df_book['ask_size2'])
# Same aggregation set applied to both WAP levels.
for wap in [1, 2]:
for agg in ['mean', 'std', 'min', 'max', realized_volatility]:
wap_aggregation = df_book.groupby('time_id')[f'wap{wap}'].agg(agg)
feature_name = agg.__name__ if callable(agg) else agg
df_train.loc[df_train['stock_id'] == stock_id, f'wap{wap}_{feature_name}'] = df_t
Aggregations on bid_ask_price_ratio are either extremely left or right skewed because lots of
values are clustered around 0 or 1. However, there is a slight relationship between target and
bid_ask_price_ratio aggregations. Mean, min and max of weighted averaged prices are probably
not informative because they don't provide any information about the movements. Standard
deviation of raw sequences and weighted averaged prices would be useful because it tells the
amount of variation. Finally, distributions of WAP1 and WAP2 realized volatilities are very similar to
distribution of target which suggests that they are correlated with next 10 minute window realized
volatility.
def visualize_continuous_feature(continuous_feature):
    """Print summary statistics of one df_train column and draw two panels:
    a KDE of the feature's training distribution and a scatter of the
    feature against `target`.

    NOTE(review): the indentation and some line endings below were mangled by
    the notebook-to-PDF export (lines end mid-expression, and Colab
    URL/timestamp artifacts are interleaved); restore from the notebook.
    """
print(f'{continuous_feature}\n{"-" * len(continuous_feature)}')
print(f'Training Mean: {float(df_train[continuous_feature].mean()):.4} - Training Median
print(f'Training Min: {float(df_train[continuous_feature].min()):.4} - Training Max: {fl
print(f'Training Skew: {float(df_train[continuous_feature].skew()):.4} - Training Kurtos
fig, axes = plt.subplots(ncols=2, figsize=(24, 6), dpi=100, constrained_layout=True)
title_size = 18
label_size = 18
# Left panel: KDE of the feature's distribution in the training set.
sns.kdeplot(df_train[continuous_feature], label='Training', fill=True, ax=axes[0])
axes[0].set_xlabel('')
axes[0].tick_params(axis='x', labelsize=label_size)
axes[0].tick_params(axis='y', labelsize=label_size)
axes[0].legend()
axes[0].set_title(f'{continuous_feature} Distribution in Training Set', size=title_size,
# Right panel: feature vs target scatter to eyeball any relationship.
sns.scatterplot(x=df_train[continuous_feature], y=df_train['target'], ax=axes[1])
axes[1].set_title(f'{continuous_feature} vs target', size=title_size, pad=title_size)
axes[1].set_xlabel('')
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 37/38
4/8/22, 8:36 PM a.ipynb - Colaboratory
axes[1].set_ylabel('')
axes[1].tick_params(axis='x', labelsize=label_size)
axes[1].tick_params(axis='y', labelsize=label_size)
plt.show()
# Run the distribution/scatter diagnostics for every column after the sixth
# (presumably the engineered feature columns — confirm against df_train).
for feature_column in df_train.columns[6:]:
    visualize_continuous_feature(feature_column)
https://colab.research.google.com/drive/1nS6OjouSRXf5rM3jabtPHfO2sDNX2ZsI#scrollTo=36a5e5de&printMode=true 38/38