#------------------------------------------------------------------------------
# accept a dataframe, remove outliers, return cleaned data in a new dataframe
# see http://www.itl.nist.gov/div898/handbook/prc/section1/prc16.htm
#------------------------------------------------------------------------------
def remove_outlier(df_in, col_name, k=1.5):
    """Return the rows of ``df_in`` whose ``col_name`` value is not an outlier.

    A value is treated as an outlier when it lies strictly beyond the
    interquartile-range fences ``q1 - k * iqr`` and ``q3 + k * iqr``, where
    ``q1`` and ``q3`` are the 25th and 75th percentiles of the column.

    Args:
        df_in: DataFrame to filter; it is not modified.
        col_name: Label of the column used to detect outliers.
        k: Fence multiplier applied to the interquartile range
            (defaults to 1.5).

    Returns:
        A new DataFrame holding the rows of ``df_in`` whose ``col_name``
        value lies within the fences (bounds included). Rows whose value is
        missing are dropped, because NaN never compares as in-range.
    """
    values = df_in[col_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1  # interquartile range
    fence_low = q1 - k * iqr
    fence_high = q3 + k * iqr
    # The bounds are inclusive on purpose: only values *beyond* a fence are
    # outliers. With strict comparisons a column whose IQR is zero (e.g. a
    # constant column) would lose every row, since both fences collapse onto
    # the value itself.
    return df_in.loc[values.between(fence_low, fence_high)]
13
# Columns to screen for outliers (one or more).
cols = ['col_1', 'col_2']

# Per-column first/third quartiles and the interquartile range.
Q1, Q3 = (df[cols].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Discard every row in which at least one screened column lies below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR for that column.
df = df.loc[
    ~(df[cols].lt(Q1 - 1.5 * IQR) | df[cols].gt(Q3 + 1.5 * IQR)).any(axis=1)
]
8
# Columns to screen for outliers (one or more).
cols = ['col_1', 'col_2']

# Per-column first/third quartiles and the interquartile range.
Q1, Q3 = (df[cols].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

# Discard every row in which at least one screened column lies below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR for that column.
df = df.loc[
    ~(df[cols].lt(Q1 - 1.5 * IQR) | df[cols].gt(Q3 + 1.5 * IQR)).any(axis=1)
]
from scipy import stats

# Example data: 100 rows x 3 columns of standard-normal samples.
df = pd.DataFrame(np.random.standard_normal((100, 3)))

# Select the rows whose z-score is below 3 in absolute value in every column.
# The selection is not assigned, so df itself is left unchanged.
df.loc[(np.abs(stats.zscore(df)) < 3).all(axis=1)]
5
# Rows whose "col" value lies between x and y, both bounds included,
# spelled out as two explicit comparisons.
df.loc[df["col"].ge(x) & df["col"].le(y)]
2
# The same filter is more readable when written with Series.between:
4
# Rows whose "col" value lies between x and y, selected with Series.between.
df.loc[df["col"].between(x, y)]