%%capture
!pip install lxml
import pandas as pd
pd.options.display.float_format = '{:,.2f}'.format
[1]
, since the Wikipedia menu is [0]
dowjones = pd.read_html("https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average")[1]
dowjones.head(5)
Company | Exchange | Symbol | Industry | Date added | Notes | Index weighting | |
---|---|---|---|---|---|---|---|
0 | 3M | NYSE | MMM | Conglomerate | 1976-08-09 | As Minnesota Mining and Manufacturing | 2.41% |
1 | American Express | NYSE | AXP | Financial services | 1982-08-30 | NaN | 3.02% |
2 | Amgen | NASDAQ | AMGN | Biopharmaceutical | 2020-08-31 | NaN | 5.48% |
3 | Apple | NASDAQ | AAPL | Information technology | 2015-03-19 | NaN | 2.84% |
4 | Boeing | NYSE | BA | Aerospace and defense | 1987-03-12 | NaN | 3.36% |
dowjones = dowjones.iloc[:, :5].copy()
dowjones.head(5)
dowjones.rename(columns = {'Company': 'company', 'Exchange': 'exchange', 'Symbol': 'symbol', 'Industry': 'industry', 'Date added': 'date_added'}, inplace = True)
dowjones.sample(5)
company | exchange | symbol | industry | date_added | |
---|---|---|---|---|---|
0 | 3M | NYSE | MMM | Conglomerate | 1976-08-09 |
23 | Travelers | NYSE | TRV | Insurance | 2009-06-08 |
28 | Walmart | NYSE | WMT | Retailing | 1997-03-17 |
5 | Caterpillar | NYSE | CAT | Construction and Mining | 1991-05-06 |
26 | Visa | NYSE | V | Financial services | 2013-09-20 |
date_added
to datetime objects¶dowjones.date_added = pd.to_datetime(dowjones.date_added)
dowjones.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 30 entries, 0 to 29 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 company 30 non-null object 1 exchange 30 non-null object 2 symbol 30 non-null object 3 industry 30 non-null object 4 date_added 30 non-null datetime64[ns] dtypes: datetime64[ns](1), object(4) memory usage: 1.3+ KB
dowjones.symbol[0:5]
0 MMM 1 AXP 2 AMGN 3 AAPL 4 BA Name: symbol, dtype: object
import unicodedata
dowjones.symbol = dowjones.symbol.apply(lambda x: unicodedata.normalize("NFKD", x))
dowjones.head(5)
company | exchange | symbol | industry | date_added | |
---|---|---|---|---|---|
0 | 3M | NYSE | MMM | Conglomerate | 1976-08-09 |
1 | American Express | NYSE | AXP | Financial services | 1982-08-30 |
2 | Amgen | NASDAQ | AMGN | Biopharmaceutical | 2020-08-31 |
3 | Apple | NASDAQ | AAPL | Information technology | 2015-03-19 |
4 | Boeing | NYSE | BA | Aerospace and defense | 1987-03-12 |
dowjones["ticker"] = dowjones.symbol.str.split(": ").apply(lambda x: x[-1])
symbol_list = dowjones.symbol.to_list()
dowjones.to_csv('dowjones.csv', index = False)
%%capture
!pip install yfinance
import yfinance as yf
yf.download("^DJI").head(5)
[*********************100%***********************] 1 of 1 completed
Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|
Date | ||||||
1992-01-02 | 3,152.10 | 3,172.63 | 3,139.31 | 3,172.40 | 3,172.40 | 23550000 |
1992-01-03 | 3,172.40 | 3,210.64 | 3,165.92 | 3,201.50 | 3,201.50 | 23620000 |
1992-01-06 | 3,201.50 | 3,213.33 | 3,191.86 | 3,200.10 | 3,200.10 | 27280000 |
1992-01-07 | 3,200.10 | 3,210.20 | 3,184.48 | 3,204.80 | 3,204.80 | 25510000 |
1992-01-08 | 3,204.80 | 3,229.20 | 3,185.82 | 3,203.90 | 3,203.90 | 29040000 |
dj_data = yf.download("^DJI", start = '2007-01-01', end = '2020-03-31')
dj_data.head(5)
[*********************100%***********************] 1 of 1 completed
Open | High | Low | Close | Adj Close | Volume | |
---|---|---|---|---|---|---|
Date | ||||||
2007-01-03 | 12,459.54 | 12,580.35 | 12,404.82 | 12,474.52 | 12,474.52 | 327200000 |
2007-01-04 | 12,473.16 | 12,510.41 | 12,403.86 | 12,480.69 | 12,480.69 | 259060000 |
2007-01-05 | 12,480.05 | 12,480.13 | 12,365.41 | 12,398.01 | 12,398.01 | 235220000 |
2007-01-08 | 12,392.01 | 12,445.92 | 12,337.37 | 12,423.49 | 12,423.49 | 223500000 |
2007-01-09 | 12,424.77 | 12,466.43 | 12,369.17 | 12,416.60 | 12,416.60 | 225190000 |
dj_data.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 3333 entries, 2007-01-03 to 2020-03-30 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Open 3333 non-null float64 1 High 3333 non-null float64 2 Low 3333 non-null float64 3 Close 3333 non-null float64 4 Adj Close 3333 non-null float64 5 Volume 3333 non-null int64 dtypes: float64(5), int64(1) memory usage: 182.3 KB
dj_data.to_csv("dj_data.csv")
symbol_list[0:5]
['MMM', 'AXP', 'AMGN', 'AAPL', 'BA']
prices = yf.download(symbol_list, start = '2007-01-01', end = '2020-03-03')
[*********************100%***********************] 30 of 30 completed
prices.head(5)
Adj Close | ... | Volume | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
AAPL | AMGN | AXP | BA | CAT | CRM | CSCO | CVX | DIS | DOW | ... | MRK | MSFT | NKE | PG | TRV | UNH | V | VZ | WBA | WMT | |
Date | |||||||||||||||||||||
2007-01-03 | 2.55 | 51.27 | 46.99 | 64.41 | 39.57 | 9.02 | 19.69 | 38.56 | 28.32 | NaN | ... | 15740226 | 76935100 | 17299200 | 9717900 | 3432800 | 8360300 | NaN | 21445850 | 6294500 | 35687300 |
2007-01-04 | 2.61 | 53.47 | 46.64 | 64.67 | 39.47 | 9.47 | 20.21 | 38.18 | 28.54 | NaN | ... | 13115930 | 45774500 | 15085600 | 8711400 | 2068200 | 5152500 | NaN | 19215860 | 3681800 | 17073000 |
2007-01-05 | 2.59 | 53.60 | 46.03 | 64.39 | 38.96 | 9.88 | 20.22 | 38.33 | 28.31 | NaN | ... | 11168431 | 44607200 | 14996800 | 9907900 | 2104600 | 6215700 | NaN | 19047041 | 3680900 | 13556900 |
2007-01-08 | 2.60 | 53.17 | 46.46 | 64.24 | 39.01 | 9.98 | 20.33 | 38.82 | 28.57 | NaN | ... | 7384522 | 50220200 | 10109600 | 11068200 | 2440900 | 4344100 | NaN | 20370917 | 4720800 | 16396400 |
2007-01-09 | 2.82 | 53.42 | 46.17 | 63.56 | 39.22 | 9.99 | 20.22 | 38.37 | 28.52 | NaN | ... | 9037114 | 44636600 | 15167200 | 10823800 | 1319500 | 5483900 | NaN | 16281352 | 3792500 | 14643200 |
5 rows × 180 columns
prices.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 3313 entries, 2007-01-03 to 2020-03-02 Columns: 180 entries, ('Adj Close', 'AAPL') to ('Volume', 'WMT') dtypes: float64(152), int64(28) memory usage: 4.6 MB
prices
as all the data from the Close
columns for each symbol¶prices = prices.loc[:,"Close"].copy()
prices
AAPL | AMGN | AXP | BA | CAT | CRM | CSCO | CVX | DIS | DOW | ... | MRK | MSFT | NKE | PG | TRV | UNH | V | VZ | WBA | WMT | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Date | |||||||||||||||||||||
2007-01-03 | 2.99 | 68.40 | 60.36 | 89.17 | 61.16 | 9.02 | 27.73 | 70.97 | 33.74 | NaN | ... | 42.00 | 29.86 | 12.21 | 64.54 | 53.55 | 52.57 | NaN | 35.31 | 46.07 | 47.55 |
2007-01-04 | 3.06 | 71.33 | 59.92 | 89.53 | 61.00 | 9.47 | 28.46 | 70.28 | 34.00 | NaN | ... | 43.04 | 29.81 | 12.33 | 64.05 | 53.10 | 52.91 | NaN | 35.50 | 46.16 | 47.78 |
2007-01-05 | 3.04 | 71.50 | 59.13 | 89.15 | 60.22 | 9.88 | 28.47 | 70.55 | 33.73 | NaN | ... | 42.27 | 29.64 | 12.35 | 63.50 | 52.41 | 52.55 | NaN | 34.90 | 45.50 | 47.39 |
2007-01-08 | 3.05 | 70.93 | 59.69 | 88.94 | 60.29 | 9.98 | 28.63 | 71.45 | 34.03 | NaN | ... | 42.26 | 29.93 | 12.32 | 63.64 | 52.02 | 53.32 | NaN | 34.36 | 45.69 | 47.00 |
2007-01-09 | 3.31 | 71.27 | 59.31 | 88.00 | 60.62 | 9.99 | 28.47 | 70.63 | 33.98 | NaN | ... | 41.87 | 29.96 | 12.47 | 63.48 | 51.89 | 52.68 | NaN | 34.50 | 45.93 | 47.39 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2020-02-25 | 72.02 | 210.38 | 120.90 | 304.14 | 129.00 | 181.27 | 42.74 | 100.71 | 128.19 | 43.95 | ... | 76.59 | 168.07 | 92.90 | 121.43 | 130.79 | 263.39 | 188.40 | 57.12 | 47.51 | 114.39 |
2020-02-26 | 73.16 | 214.54 | 118.50 | 305.59 | 128.25 | 178.87 | 42.16 | 98.04 | 123.36 | 43.05 | ... | 76.28 | 170.17 | 92.00 | 120.16 | 128.56 | 263.00 | 187.21 | 57.14 | 47.67 | 113.78 |
2020-02-27 | 68.38 | 203.50 | 112.81 | 287.76 | 123.27 | 172.15 | 40.04 | 94.13 | 118.04 | 39.55 | ... | 74.50 | 158.18 | 88.55 | 113.50 | 124.08 | 253.92 | 180.01 | 55.06 | 46.24 | 110.40 |
2020-02-28 | 68.34 | 199.73 | 109.93 | 275.11 | 124.24 | 170.40 | 39.93 | 93.34 | 117.65 | 40.41 | ... | 73.05 | 162.01 | 89.38 | 113.23 | 119.81 | 254.96 | 181.76 | 54.16 | 45.76 | 107.68 |
2020-03-02 | 74.70 | 212.20 | 113.87 | 289.27 | 127.60 | 176.76 | 41.17 | 96.59 | 119.98 | 41.42 | ... | 77.64 | 172.79 | 92.68 | 119.56 | 127.68 | 273.11 | 192.33 | 57.32 | 48.11 | 115.88 |
3313 rows × 30 columns
prices.info()
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 3313 entries, 2007-01-03 to 2020-03-02 Data columns (total 30 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 AAPL 3313 non-null float64 1 AMGN 3313 non-null float64 2 AXP 3313 non-null float64 3 BA 3313 non-null float64 4 CAT 3313 non-null float64 5 CRM 3313 non-null float64 6 CSCO 3313 non-null float64 7 CVX 3313 non-null float64 8 DIS 3313 non-null float64 9 DOW 240 non-null float64 10 GS 3313 non-null float64 11 HD 3313 non-null float64 12 HON 3313 non-null float64 13 IBM 3313 non-null float64 14 INTC 3313 non-null float64 15 JNJ 3313 non-null float64 16 JPM 3313 non-null float64 17 KO 3313 non-null float64 18 MCD 3313 non-null float64 19 MMM 3313 non-null float64 20 MRK 3313 non-null float64 21 MSFT 3313 non-null float64 22 NKE 3313 non-null float64 23 PG 3313 non-null float64 24 TRV 3313 non-null float64 25 UNH 3313 non-null float64 26 V 3009 non-null float64 27 VZ 3313 non-null float64 28 WBA 3313 non-null float64 29 WMT 3313 non-null float64 dtypes: float64(30) memory usage: 802.4 KB
prices.to_csv('prices.csv')