Skip to content

Latest commit

 

History

History
164 lines (128 loc) · 6.62 KB

File metadata and controls

164 lines (128 loc) · 6.62 KB

Hybrid Approach: Logistic Regression and Random Forests for HFT

Objective: Develop a trinomial classifier for predicting market direction in high-frequency trading. The classifier will use logistic regression as the primary model and random forests as the secondary model.

1. Data Preparation

Prepare and preprocess your data to create features and labels suitable for classification.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load data
data = pd.read_csv('/path/to/order_book_data.csv')

# Feature engineering: Create VOI, OIR, MPB
# VOI (Volume Order Imbalance): Difference between bid and ask volumes
data['VOI'] = data['bid_volume'] - data['ask_volume']

# OIR (Order Imbalance Ratio): Normalized difference between bid and ask volumes
data['OIR'] = (data['bid_volume'] - data['ask_volume']) / (data['bid_volume'] + data['ask_volume'])

# MPB (Mid-Price Basis): Normalized difference between mid-price and its previous value
data['MPB'] = (data['mid_price'] - data['mid_price'].shift(1)) / data['spread']

# Include lags for VOI, OIR, MPB (up to 5 lags)
# Lags help capture temporal dependencies and improve model's predictive power
for lag in range(1, 6):
    data[f'VOI_lag{lag}'] = data['VOI'].shift(lag)
    data[f'OIR_lag{lag}'] = data['OIR'].shift(lag)
    data[f'MPB_lag{lag}'] = data['MPB'].shift(lag)

# Drop rows with NaN values due to lagging
data.dropna(inplace=True)

# Features and labels
features = data.drop(columns=['future_price_movement'])
labels = data['future_price_movement']

# Split data into training and testing sets
# Ensure no data leakage by maintaining the temporal order
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, shuffle=False)

# Standardize features to have zero mean and unit variance
# Standardization helps in improving the convergence of some machine learning algorithms
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

2. Primary Model: Logistic Regression

Train the logistic regression model and evaluate its performance.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Train logistic regression model
# Multinomial logistic regression is used for multi-class classification
# The solver 'lbfgs' is efficient for small datasets and handles multinomial loss well
lr_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict and evaluate the model on the test set
# Classification report provides detailed metrics (precision, recall, f1-score) for each class
y_pred_lr = lr_model.predict(X_test)
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_lr))

3. Secondary Model: Random Forests

Train the random forest model and evaluate its performance.

from sklearn.ensemble import RandomForestClassifier

# Train random forest model
# Random forest is an ensemble method that reduces overfitting by averaging multiple decision trees
# n_estimators: number of trees in the forest
# max_depth: maximum depth of the tree to prevent overfitting
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate the model on the test set
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Classification Report:\n", classification_report(y_test, y_pred_rf))

4. Hybrid Approach: Combining Predictions

Combine the predictions from logistic regression and random forests to create a hybrid model.

import numpy as np
from sklearn.metrics import accuracy_score

# Combine predictions using majority voting
# For simplicity, we take the average of the predictions from both models and round to the nearest integer
# This is a basic form of ensemble learning to leverage the strengths of both models
combined_pred = np.round((y_pred_lr + y_pred_rf) / 2).astype(int)

# Evaluate the combined model
# Classification report and accuracy score give a sense of overall performance
print("Combined Model Classification Report:\n", classification_report(y_test, combined_pred))
print("Combined Model Accuracy:", accuracy_score(y_test, combined_pred))

5. Model Validation

Validate the models using cross-validation and other metrics.

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Cross-validation for logistic regression
# Cross-validation helps ensure the model generalizes well to unseen data
cv_scores_lr = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='accuracy')
print("Logistic Regression Cross-Validation Scores:", cv_scores_lr)
print("Logistic Regression Mean CV Score:", cv_scores_lr.mean())

# Cross-validation for random forest
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='accuracy')
print("Random Forest Cross-Validation Scores:", cv_scores_rf)
print("Random Forest Mean CV Score:", cv_scores_rf.mean())

# Confusion matrix for combined model
# Confusion matrix provides detailed insights into the true positive, false positive, true negative, and false negative counts
conf_matrix = confusion_matrix(y_test, combined_pred)
print("Combined Model Confusion Matrix:\n", conf_matrix)

# Additional metrics for combined model
# These metrics provide a comprehensive evaluation of the model performance
accuracy = accuracy_score(y_test, combined_pred)
precision = precision_score(y_test, combined_pred, average='weighted')
recall = recall_score(y_test, combined_pred, average='weighted')
f1 = f1_score(y_test, combined_pred, average='weighted')

print(f"Combined Model Accuracy: {accuracy}")
print(f"Combined Model Precision: {precision}")
print(f"Combined Model Recall: {recall}")
print(f"Combined Model F1 Score: {f1}")

6. Backtesting and Real-Time Implementation

Backtest the hybrid model on historical data to validate its performance. Implement the model in a real-time trading system.

# Placeholder for backtesting logic
# Backtesting involves running the model on historical data to simulate trading and evaluate performance
def backtest_model(model, data):
    # Implement backtesting logic
    pass

# Backtest hybrid model
backtest_model(combined_pred, X_test)

# Real-time trading implementation
# Real-time trading involves using live market data to make trading decisions
def real_time_trading(model, live_data):
    # Implement real-time trading logic
    pass

# Implement real-time trading logic
real_time_trading(combined_pred, X_test)