#Import relevant packages
import yfinance as yf
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

# Download IVV data
ivv = yf.download(
    tickers="IVV",
    start="2001-01-01",
    end="2024-12-31",
    auto_adjust=True,
    progress=False
)

ivv.to_csv("IVV_2001_2024_raw.csv") #Transform to .CSV file

df = pd.read_csv("IVV_2001_2024_raw.csv", skiprows=2)
df.columns = ["Date", "Close", "High", "Low", "Open", "Volume"]
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df = df.dropna(subset=["Date"])
df.head()

# Plot daily adjusted close
plt.figure(figsize=(16, 9))
df["Close"].plot(title="Figure 1. IVV Daily Close Price (2001–2024)", lw=1.3)
plt.xlabel("Year")
plt.ylabel("Price (USD)")
plt.tight_layout()
plt.show()

#Compute statistics, EDA
df_temp = df.copy()
df_temp["Date"] = pd.to_datetime(df_temp["Date"], errors="coerce")
df_temp = df_temp.dropna(subset=["Date"])
df_temp = df_temp.set_index("Date")
close = df_temp["Close"]
ivv_m = close.resample("ME").last()  # Monthly end price
ret_d = close.pct_change().dropna()
ret_m = ivv_m.pct_change().dropna()

summary = pd.DataFrame({
    "Observation (daily)": [len(df_temp)],
    "Annual mean return (%)": [((1 + ret_d.mean())**252 - 1) * 100],
    "Annual volatility (%)": [ret_d.std() * (252**0.5) * 100],
    "Worst month return (%)": [ret_m.min() * 100],
    "Best month return (%)": [ret_m.max() * 100]
}).round(2)
summary

#Generate a duplicate dataframe
data = df.copy()

# Even Better SineWave
data["EvenBetterSineWave"] = np.sin(np.linspace(0, np.pi * 2 * len(data) / 200, len(data)))

# Balance of Power (BOP) ---
data["BOP"] = (data["Close"] - data["Open"]) / (data["High"] - data["Low"])

# Stochastic RSI (14-period) ---
rsi_window = 14
delta = data["Close"].diff()
gain = delta.clip(lower=0)
loss = -delta.clip(upper=0)
avg_gain = gain.rolling(rsi_window).mean()
avg_loss = loss.rolling(rsi_window).mean()
rs = avg_gain / avg_loss
rsi = 100 - (100 / (1 + rs))
data["StochRSI_K"] = (rsi - rsi.rolling(14).min()) / (rsi.rolling(14).max() - rsi.rolling(14).min())

# Correlation Trend Indicator 
data["CorrTrend"] = data["Close"].rolling(30).corr(pd.Series(range(len(data))))

# KDJ (K, D, J) ---
low_n = data["Low"].rolling(14).min()
high_n = data["High"].rolling(14).max()
data["K"] = 100 * (data["Close"] - low_n) / (high_n - low_n)
data["D"] = data["K"].rolling(3).mean()
data["J"] = 3 * data["K"] - 2 * data["D"]

# Williams %R (14) ---
data["WilliamsR"] = -100 * (high_n - data["Close"]) / (high_n - low_n)

# Z-Score of Close (20) ---
rolling_mean = data["Close"].rolling(20).mean()
rolling_std = data["Close"].rolling(20).std()
data["ZScore"] = (data["Close"] - rolling_mean) / rolling_std

# Decreasing / Increasing Flags ---
data["DEC"] = np.where(data["Close"].diff() < 0, 1, 0)
data["INC"] = np.where(data["Close"].diff() > 0, 1, 0)

# TTM Trend (5) ---
data["TTM_Trend"] = np.where(data["Close"] > data["Close"].rolling(5).mean(), 1, 0)

# Bollinger Bands Percent (BBP) ---
bb_mid = data["Close"].rolling(20).mean()
bb_std = data["Close"].rolling(20).std()
bb_upper = bb_mid + 2 * bb_std
bb_lower = bb_mid - 2 * bb_std
data["BBP"] = (data["Close"] - bb_lower) / (bb_upper - bb_lower)

# Average On-Balance Volume (AOBV) ---
obv = [0]
for i in range(1, len(data)):
    if data["Close"].iloc[i] > data["Close"].iloc[i-1]:
        obv.append(obv[-1] + data["Volume"].iloc[i])
    elif data["Close"].iloc[i] < data["Close"].iloc[i-1]:
        obv.append(obv[-1] - data["Volume"].iloc[i])
    else:
        obv.append(obv[-1])
data["AOBV"] = pd.Series(obv).rolling(10).mean()

# Price-Volume Rank (PVR) ---
p_change = data["Close"].pct_change()
v_change = data["Volume"].pct_change()
conditions = [
    (p_change > 0) & (v_change > 0),
    (p_change > 0) & (v_change < 0),
    (p_change < 0) & (v_change > 0),
    (p_change < 0) & (v_change < 0)
]
choices = [1, 2, 3, 4]
data["PVR"] = np.select(conditions, choices, default=np.nan)

features = [
    "EvenBetterSineWave", "BOP", "StochRSI_K", "CorrTrend", "K", "D", "J",
    "WilliamsR", "ZScore", "DEC", "INC", "TTM_Trend", "BBP", "AOBV", "PVR"
]

#Preview the data with close price
data[features + ["Close"]].head(50)

# Define binary target Γ_t = 1 if next day open > today open
data["Target"] = np.where(data["Open"].shift(-1) > data["Open"], 1, 0)
data = data.dropna(subset=["Target"]).reset_index(drop=True)

# Feature matrix and target
X = data[features]
y = data["Target"].astype(int).values

# Standardize features (z-score)
X = (X - X.mean()) / X.std()

# Compute Pearson correlations between each feature and the target
corrs = []
for col in X.columns:
    valid_idx = X[col].notna()
    if valid_idx.sum() > 2: 
        corr = np.corrcoef(X.loc[valid_idx, col], y[valid_idx])[0, 1]
    else:
        corr = np.nan
    corrs.append({"Feature": col, "Correlation_with_Target": round(corr, 4) if pd.notna(corr) else np.nan})

corr_df = pd.DataFrame(corrs).sort_values(by="Correlation_with_Target", ascending=False)
print("\n=== Correlation of Each Feature with Target ===")
print(corr_df)

=== Correlation of Each Feature with Target ===
               Feature  Correlation_with_Target
1                  BOP                   0.6708
10                 INC                   0.4584
6                    J                   0.4353
11           TTM_Trend                   0.3338
4                    K                   0.2776
7            WilliamsR                   0.2776
8               ZScore                   0.2465
12                 BBP                   0.2465
2           StochRSI_K                   0.1827
5                    D                   0.0901
13                AOBV                   0.0293
3            CorrTrend                   0.0209
0   EvenBetterSineWave                  -0.0141
14                 PVR                  -0.3958
9                  DEC                  -0.4567

# K-Fold Cross-Validation (10 folds)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

# --- Xử lý missing values ---
imputer = SimpleImputer(strategy="mean")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# --- K-Fold ---
kf = KFold(n_splits=10, shuffle=True, random_state=42)
accuracies = []

for train_idx, test_idx in kf.split(X):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    model = LogisticRegression(max_iter=500)
    model.fit(X_train, y_train)

    preds = np.where(model.predict_proba(X_test)[:, 1] > 0.5, 1, 0)
    acc = (preds == y_test).mean()
    accuracies.append(acc)

print(f"\nMean CV Accuracy: {np.mean(accuracies)*100:.2f}% (± {np.std(accuracies)*100:.2f}%)")

Mean CV Accuracy: 80.70% (± 2.00%)

# Cross-validation summary
cv_summary = pd.DataFrame({
    "Metric": ["Mean Accuracy (%)", "Std Accuracy (%)"],
    "Value": [np.mean(accuracies)*100, np.std(accuracies)*100]
}).round(2)
print("\n=== Cross-Validation Summary ===")
print(cv_summary)

# Plot correlation of top 10 features
plt.figure(figsize=(8, 5))
plt.barh(corr_df["Feature"].head(10), corr_df["Correlation_with_Target"].head(10))
plt.title("Figure 2. Top 10 Feature–Target Correlations")
plt.xlabel("Correlation with Target")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

# Plot cross-validation accuracy distribution
plt.figure(figsize=(6, 4))
plt.boxplot(accuracies, vert=False, patch_artist=True)
plt.title("Figure 3. 10-Fold Cross-Validation Accuracy (Logistic Regression)")
plt.xlabel("Accuracy")
plt.tight_layout()
plt.show()

=== Cross-Validation Summary ===
              Metric  Value
0  Mean Accuracy (%)   80.7
1   Std Accuracy (%)    2.0

	Date	Close	High	Low	Open	Volume
0	2001-01-02	82.002693	83.974103	81.405296	83.794884	174300
1	2001-01-03	86.084885	86.104798	81.564582	81.713931	194500
2	2001-01-04	85.109085	86.164486	84.830300	85.825961	153500
3	2001-01-05	82.649834	85.198726	82.540311	85.198726	68400
4	2001-01-08	82.779297	82.779297	81.474980	82.440772	167700

	EvenBetterSineWave	BOP	StochRSI_K	CorrTrend	K	D	J	WilliamsR	ZScore	DEC	INC	TTM_Trend	BBP	AOBV	PVR	Close
0	0.000000	-0.697674	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	0	0	NaN	NaN	NaN	82.002693
1	0.031416	0.962719	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	0	NaN	NaN	1.0	86.084885
2	0.062801	-0.537313	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	0	0	NaN	NaN	4.0	85.109085
3	0.094124	-0.958801	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	0	0	NaN	NaN	4.0	82.649834
4	0.125354	0.259542	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	0	NaN	NaN	1.0	82.779297
5	0.156460	-0.433333	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	0	NaN	NaN	2.0	82.799156
6	0.187412	0.954023	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	1	NaN	NaN	2.0	83.964111
7	0.218179	0.475177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	1	NaN	NaN	1.0	84.412155
8	0.248730	-0.366197	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	0	1	NaN	NaN	4.0	84.193130
9	0.279036	0.365591	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	1	NaN	268780.0	2.0	84.541618
10	0.309067	-0.588652	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	1	NaN	340370.0	1.0	84.959816
11	0.338792	0.496933	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	1	1	NaN	399450.0	2.0	85.925560
12	0.368183	-0.761194	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	0	1	NaN	469900.0	4.0	85.626877
13	0.397210	0.247788	NaN	NaN	82.319997	NaN	NaN	-17.680003	NaN	0	1	1	NaN	551140.0	2.0	85.716522
14	0.425845	0.604167	NaN	NaN	93.309250	NaN	NaN	-6.690750	NaN	0	1	1	NaN	619120.0	2.0	86.612617
15	0.454060	0.513158	NaN	NaN	93.760493	89.796580	101.688319	-6.239507	NaN	0	1	1	NaN	694720.0	1.0	87.010811
16	0.481827	-0.355263	NaN	NaN	85.835089	90.968277	75.568714	-14.164911	NaN	1	0	1	NaN	751170.0	4.0	86.542877
17	0.509118	0.337079	NaN	NaN	85.329196	88.308259	79.371069	-14.670804	NaN	1	0	1	NaN	764910.0	4.0	86.513008
18	0.535906	0.820893	NaN	NaN	92.263630	87.809305	101.172279	-7.736370	NaN	0	1	1	NaN	799470.0	2.0	86.980972
19	0.562165	0.602156	NaN	NaN	99.885557	92.492794	114.671083	-0.114443	1.615133	0	1	1	0.903783	830150.0	2.0	87.790283
20	0.587869	-0.468217	NaN	NaN	78.856939	90.335375	55.900067	-21.143061	1.200464	1	0	1	0.800116	820000.0	3.0	87.235847
21	0.612993	0.616884	NaN	NaN	90.740361	89.827619	92.565845	-9.259639	1.427373	0	1	1	0.856843	809410.0	2.0	87.796623
22	0.637512	-0.913330	NaN	NaN	50.288001	73.295100	4.273802	-49.711999	0.314028	1	0	0	0.578507	797950.0	4.0	86.018761
23	0.661402	0.619042	NaN	NaN	46.893704	62.640689	15.399735	-53.106296	0.399716	0	1	0	0.599929	785490.0	2.0	86.286385
24	0.684638	-0.314518	NaN	NaN	44.852891	47.344865	39.868943	-55.147109	0.330454	0	1	0	0.582613	771360.0	2.0	86.305504
25	0.707199	-0.447756	NaN	NaN	15.495255	35.747284	-25.008801	-84.504745	-0.269909	1	0	0	0.432523	731530.0	3.0	85.700142
26	0.729061	-0.875622	NaN	NaN	0.000000	20.116049	-40.232098	-100.000000	-1.075621	1	0	0	0.231095	692550.0	3.0	84.909958
27	0.750204	-0.715792	0.000000	NaN	7.627003	7.707419	7.466169	-92.372997	-1.812764	1	0	0	0.046809	660610.0	4.0	84.049721
28	0.770606	0.895109	0.000000	NaN	25.848049	11.158351	55.227447	-74.151951	-1.146534	0	1	0	0.213366	620180.0	2.0	84.871773
29	0.790248	-0.511238	0.000000	0.472638	15.395992	16.290348	13.607281	-84.604008	-1.574033	1	0	0	0.106492	570230.0	3.0	84.400223
30	0.809109	-0.329334	0.008487	0.316349	14.583408	18.609150	6.531924	-85.416592	-1.759883	1	0	0	0.060029	548150.0	4.0	84.036980
31	0.827171	0.425284	0.110988	0.320467	29.166875	19.715425	48.069776	-70.833125	-1.040667	0	1	1	0.239833	525040.0	1.0	84.750679
32	0.844417	-0.065691	0.000000	0.212955	8.160865	17.303716	-10.124838	-91.839135	-2.048956	1	0	0	-0.012239	483640.0	3.0	83.125786
33	0.860829	-0.847582	0.000000	-0.028010	3.076984	13.468241	-17.705530	-96.923016	-2.400381	1	0	0	-0.100095	434950.0	4.0	81.793953
34	0.876392	-0.656564	0.000000	-0.277537	1.135437	4.124429	-4.842545	-98.864563	-2.597350	1	0	0	-0.149338	377540.0	3.0	80.111702
35	0.891089	-0.123077	0.000000	-0.475079	14.068941	6.093788	30.019248	-85.931059	-2.191603	1	0	0	-0.047901	295920.0	3.0	79.939659
36	0.904907	0.115607	0.046052	-0.597497	19.024725	11.409701	34.254773	-80.975275	-1.997126	1	0	0	0.000718	202500.0	4.0	79.493591
37	0.917831	0.551280	0.239755	-0.679062	34.134333	22.409333	57.584333	-65.865667	-1.295961	0	1	1	0.176010	121100.0	2.0	80.895454
38	0.929849	-0.257732	0.293092	-0.758132	34.111575	29.090211	44.154302	-65.888425	-1.259661	1	0	1	0.185085	-6060.0	3.0	80.615105
39	0.940950	-0.779944	0.258383	-0.822670	15.738268	27.994725	-8.774645	-84.261732	-1.637792	1	0	0	0.090552	-139330.0	4.0	79.060303
40	0.951121	0.196850	0.452931	-0.870157	21.512596	23.787480	16.962829	-78.487404	-1.380344	0	1	0	0.154914	-263340.0	2.0	79.359779
41	0.960353	0.298611	0.479212	-0.896994	13.865566	17.038810	7.519078	-86.134434	-1.457898	1	0	0	0.135526	-396640.0	4.0	78.779907
42	0.968638	0.530309	0.418527	-0.922636	21.597110	18.991757	26.807816	-78.402890	-1.125549	0	1	0	0.218613	-460580.0	1.0	79.366188
43	0.975966	-0.091503	0.698471	-0.937852	35.371690	23.611456	58.892160	-64.628310	-0.696391	0	1	1	0.325902	-380350.0	1.0	80.309273
44	0.982330	-0.172039	0.815474	-0.937186	39.913129	32.293977	55.151434	-60.086871	-0.491771	0	1	1	0.377057	-191960.0	2.0	80.640625
45	0.987725	0.261905	0.865821	-0.927525	54.107222	43.130681	76.060306	-45.892778	-0.314730	0	1	1	0.421317	36570.0	2.0	80.876373
46	0.992145	-0.771186	0.786751	-0.933840	17.028772	37.016374	-22.946433	-82.971228	-1.203039	1	0	0	0.199240	77600.0	3.0	78.677956
47	0.995585	-0.970853	0.547592	-0.936772	1.220969	24.118988	-44.575068	-98.779031	-2.309102	1	0	0	-0.077275	-88740.0	4.0	75.200119
48	0.998042	0.334645	1.000000	-0.940590	23.579264	13.943002	42.851789	-76.420736	-1.632565	0	1	0	0.091859	-199150.0	2.0	76.438820
49	0.999515	-0.048110	0.789781	-0.939410	8.768278	11.189504	3.925827	-91.231722	-2.070699	1	0	0	-0.017675	-314960.0	3.0	74.612755

Financial Data Project #2: Using Neural Networks and Optimized Technical Indicators for Stock Trend Prediction: A Case Study on the iShares Core S&P500 ETF (IVV)¶

Introduction¶

Daily price plot¶

Feature Selection¶

Feature Correlation with Next-Day Market Direction¶

Multilayer Perceptron Model Training and K-Fold Cross-validation¶

Result Summary¶

Conclusion¶