SHAP KernelExplainer error on textual data using pipeline

Multi tool use
up vote
0
down vote
favorite
I was looking through the SHAP package for Python and found no examples that use KernelExplainer to explain predictions on textual data, so I decided to test it out using the dataset I found on https://www.superdatascience.com/machine-learning/.
I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!
Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
# Load the data
# Raw string: in a normal literal '\U' starts a \UXXXXXXXX escape, which is a
# SyntaxError for this Windows path.
os.chdir(r'C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
# The .tsv file is tab-separated; sep='t' would split on the letter "t".
review = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
#Clean the data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def clean_text(df_text_column, data):
    """Normalise each text into a string of stemmed, stop-word-free tokens.

    Every text has its non-letter characters replaced by spaces, is
    lower-cased, split on whitespace, filtered against the English stop-word
    list, Porter-stemmed and re-joined with single spaces.

    Args:
        df_text_column: Container of raw strings, indexed with the integers
            ``0 .. len(data) - 1``.
        data: Sized object whose length gives the number of texts to process.

    Returns:
        A list of cleaned strings, one per input text, in input order.
    """
    # Build the stop-word set and the stemmer once instead of once per
    # word / per row; the result is the same, only much faster.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    corpus = []
    for i in range(len(data)):
        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i]).lower()
        stems = [ps.stem(word) for word in text.split() if word not in stop_words]
        corpus.append(' '.join(stems))
    return corpus
# Features: the cleaned review text, taken back out of a one-column DataFrame
# as a Series. Target: the 'Liked' column.
X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']
# Splitting the dataset into the Training set and Test set
# (25% of the rows are held out; random_state fixes the split).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Creating the pipeline: TF-IDF vectoriser followed by a random forest.
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
# rf is created without random_state; presumably this global seed is what
# makes the forest reproducible -- confirm.
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
# Bare expression: only displays the steps in an interactive session.
rf_pipe.steps
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)
#Performance Metrics
# (results are neither stored nor printed; visible only interactively)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score
# use Kernel SHAP to explain test set predictions
# NOTE(review): this is the step reported to fail with
# "AttributeError: 'numpy.ndarray' object has no attribute 'lower'".
# Presumably the explainer passes NumPy rows to the pipeline, which the
# vectoriser cannot lower-case; explaining rf.predict_proba on
# vect.transform(...) output is a likely fix -- confirm.
import shap
explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)
# plot the SHAP values
# NOTE(review): X_test is a Series (a single column was selected above), so
# the two-indexer .iloc[0,:] looks likely to fail as well -- confirm.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")
python pipe pipeline
add a comment |
up vote
0
down vote
favorite
I was looking through the SHAP package for Python and found no examples that use KernelExplainer to explain predictions on textual data, so I decided to test it out using the dataset I found on https://www.superdatascience.com/machine-learning/.
I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!
Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
# Load the data
# Raw string: in a normal literal '\U' starts a \UXXXXXXXX escape, which is a
# SyntaxError for this Windows path.
os.chdir(r'C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
# The .tsv file is tab-separated; sep='t' would split on the letter "t".
review = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
#Clean the data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def clean_text(df_text_column, data):
    """Normalise each text into a string of stemmed, stop-word-free tokens.

    Every text has its non-letter characters replaced by spaces, is
    lower-cased, split on whitespace, filtered against the English stop-word
    list, Porter-stemmed and re-joined with single spaces.

    Args:
        df_text_column: Container of raw strings, indexed with the integers
            ``0 .. len(data) - 1``.
        data: Sized object whose length gives the number of texts to process.

    Returns:
        A list of cleaned strings, one per input text, in input order.
    """
    # Build the stop-word set and the stemmer once instead of once per
    # word / per row; the result is the same, only much faster.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    corpus = []
    for i in range(len(data)):
        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i]).lower()
        stems = [ps.stem(word) for word in text.split() if word not in stop_words]
        corpus.append(' '.join(stems))
    return corpus
# Features: the cleaned review text, taken back out of a one-column DataFrame
# as a Series. Target: the 'Liked' column.
X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']
# Splitting the dataset into the Training set and Test set
# (25% of the rows are held out; random_state fixes the split).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Creating the pipeline: TF-IDF vectoriser followed by a random forest.
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
# rf is created without random_state; presumably this global seed is what
# makes the forest reproducible -- confirm.
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
# Bare expression: only displays the steps in an interactive session.
rf_pipe.steps
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)
#Performance Metrics
# (results are neither stored nor printed; visible only interactively)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score
# use Kernel SHAP to explain test set predictions
# NOTE(review): this is the step reported to fail with
# "AttributeError: 'numpy.ndarray' object has no attribute 'lower'".
# Presumably the explainer passes NumPy rows to the pipeline, which the
# vectoriser cannot lower-case; explaining rf.predict_proba on
# vect.transform(...) output is a likely fix -- confirm.
import shap
explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)
# plot the SHAP values
# NOTE(review): X_test is a Series (a single column was selected above), so
# the two-indexer .iloc[0,:] looks likely to fail as well -- confirm.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")
python pipe pipeline
add a comment |
up vote
0
down vote
favorite
up vote
0
down vote
favorite
I was looking through the SHAP package for Python and found no examples that use KernelExplainer to explain predictions on textual data, so I decided to test it out using the dataset I found on https://www.superdatascience.com/machine-learning/.
I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!
Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
# Load the data
# Raw string: in a normal literal '\U' starts a \UXXXXXXXX escape, which is a
# SyntaxError for this Windows path.
os.chdir(r'C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
# The .tsv file is tab-separated; sep='t' would split on the letter "t".
review = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
#Clean the data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def clean_text(df_text_column, data):
    """Normalise each text into a string of stemmed, stop-word-free tokens.

    Every text has its non-letter characters replaced by spaces, is
    lower-cased, split on whitespace, filtered against the English stop-word
    list, Porter-stemmed and re-joined with single spaces.

    Args:
        df_text_column: Container of raw strings, indexed with the integers
            ``0 .. len(data) - 1``.
        data: Sized object whose length gives the number of texts to process.

    Returns:
        A list of cleaned strings, one per input text, in input order.
    """
    # Build the stop-word set and the stemmer once instead of once per
    # word / per row; the result is the same, only much faster.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    corpus = []
    for i in range(len(data)):
        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i]).lower()
        stems = [ps.stem(word) for word in text.split() if word not in stop_words]
        corpus.append(' '.join(stems))
    return corpus
# Features: the cleaned review text, taken back out of a one-column DataFrame
# as a Series. Target: the 'Liked' column.
X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']
# Splitting the dataset into the Training set and Test set
# (25% of the rows are held out; random_state fixes the split).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Creating the pipeline: TF-IDF vectoriser followed by a random forest.
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
# rf is created without random_state; presumably this global seed is what
# makes the forest reproducible -- confirm.
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
# Bare expression: only displays the steps in an interactive session.
rf_pipe.steps
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)
#Performance Metrics
# (results are neither stored nor printed; visible only interactively)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score
# use Kernel SHAP to explain test set predictions
# NOTE(review): this is the step reported to fail with
# "AttributeError: 'numpy.ndarray' object has no attribute 'lower'".
# Presumably the explainer passes NumPy rows to the pipeline, which the
# vectoriser cannot lower-case; explaining rf.predict_proba on
# vect.transform(...) output is a likely fix -- confirm.
import shap
explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)
# plot the SHAP values
# NOTE(review): X_test is a Series (a single column was selected above), so
# the two-indexer .iloc[0,:] looks likely to fail as well -- confirm.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")
python pipe pipeline
I was looking through the SHAP package for Python and found no examples that use KernelExplainer to explain predictions on textual data, so I decided to test it out using the dataset I found on https://www.superdatascience.com/machine-learning/.
I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.
AttributeError: 'numpy.ndarray' object has no attribute 'lower'
Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!
Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk
# Load the data
# Raw string: in a normal literal '\U' starts a \UXXXXXXXX escape, which is a
# SyntaxError for this Windows path.
os.chdir(r'C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
# The .tsv file is tab-separated; sep='t' would split on the letter "t".
review = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')
#Clean the data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
def clean_text(df_text_column, data):
    """Normalise each text into a string of stemmed, stop-word-free tokens.

    Every text has its non-letter characters replaced by spaces, is
    lower-cased, split on whitespace, filtered against the English stop-word
    list, Porter-stemmed and re-joined with single spaces.

    Args:
        df_text_column: Container of raw strings, indexed with the integers
            ``0 .. len(data) - 1``.
        data: Sized object whose length gives the number of texts to process.

    Returns:
        A list of cleaned strings, one per input text, in input order.
    """
    # Build the stop-word set and the stemmer once instead of once per
    # word / per row; the result is the same, only much faster.
    stop_words = set(stopwords.words('english'))
    ps = PorterStemmer()
    corpus = []
    for i in range(len(data)):
        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i]).lower()
        stems = [ps.stem(word) for word in text.split() if word not in stop_words]
        corpus.append(' '.join(stems))
    return corpus
# Features: the cleaned review text, taken back out of a one-column DataFrame
# as a Series. Target: the 'Liked' column.
X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']
# Splitting the dataset into the Training set and Test set
# (25% of the rows are held out; random_state fixes the split).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# Creating the pipeline: TF-IDF vectoriser followed by a random forest.
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
# rf is created without random_state; presumably this global seed is what
# makes the forest reproducible -- confirm.
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
# Bare expression: only displays the steps in an interactive session.
rf_pipe.steps
rf_pipe.fit(X_train, y_train)
y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)
#Performance Metrics
# (results are neither stored nor printed; visible only interactively)
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score
# use Kernel SHAP to explain test set predictions
# NOTE(review): this is the step reported to fail with
# "AttributeError: 'numpy.ndarray' object has no attribute 'lower'".
# Presumably the explainer passes NumPy rows to the pipeline, which the
# vectoriser cannot lower-case; explaining rf.predict_proba on
# vect.transform(...) output is a likely fix -- confirm.
import shap
explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)
# plot the SHAP values
# NOTE(review): X_test is a Series (a single column was selected above), so
# the two-indexer .iloc[0,:] looks likely to fail as well -- confirm.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")
python pipe pipeline
python pipe pipeline
asked yesterday
Lacri Mosa
377
377
add a comment |
add a comment |
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
active
oldest
votes
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53183945%2fshap-kernelexplainer-error-on-textual-data-using-pipeline%23new-answer', 'question_page');
}
);
Post as a guest
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
nz5,ZdBCQ