SHAP KernelExplainer error on textual data using pipeline

up vote
0
down vote

favorite

I was looking through the SHAP package for Python and I found no examples using KernelExplainer to explain textual data predictions so I decided to test it out using the dataset i found on https://www.superdatascience.com/machine-learning/.

I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!

Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import os

import re

import nltk



#Load the data

os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')

review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')



#Clean the data

nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer



def clean_text(df_text_column, data):   

    corpus = 

    for i in range(0, len(data)):

        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])

        text = text.lower()

        text = text.split()

        ps = PorterStemmer()

        text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]

        text = ' '.join(text)

        corpus.append(text)

    return corpus



X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']

y = review['Liked']



# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)



# Creating the pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer() 

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

from sklearn.pipeline import make_pipeline

np.random.seed(0)

rf_pipe = make_pipeline(vect, rf)

rf_pipe.steps

rf_pipe.fit(X_train, y_train)



y_pred = rf_pipe.predict(X_test)

y_prob = rf_pipe.predict_proba(X_test)



#Performance Metrics

from sklearn import metrics

metrics.accuracy_score(y_test, y_pred) #Accuracy

metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score



# use Kernel SHAP to explain test set predictions

import shap

explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")

shap_values = explainer.shap_values(X_test, nsamples=100)



# plot the SHAP values

shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")

asked yesterday

Lacri Mosa

377

add a comment |

up vote
0
down vote

favorite

I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!

Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import os

import re

import nltk



#Load the data

os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')

review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')



#Clean the data

nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer



def clean_text(df_text_column, data):   

    corpus = 

    for i in range(0, len(data)):

        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])

        text = text.lower()

        text = text.split()

        ps = PorterStemmer()

        text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]

        text = ' '.join(text)

        corpus.append(text)

    return corpus



X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']

y = review['Liked']



# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)



# Creating the pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer() 

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

from sklearn.pipeline import make_pipeline

np.random.seed(0)

rf_pipe = make_pipeline(vect, rf)

rf_pipe.steps

rf_pipe.fit(X_train, y_train)



y_pred = rf_pipe.predict(X_test)

y_prob = rf_pipe.predict_proba(X_test)



#Performance Metrics

from sklearn import metrics

metrics.accuracy_score(y_test, y_pred) #Accuracy

metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score



# use Kernel SHAP to explain test set predictions

import shap

explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")

shap_values = explainer.shap_values(X_test, nsamples=100)



# plot the SHAP values

shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")

asked yesterday

Lacri Mosa

377

add a comment |

up vote
0
down vote

favorite

I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!

Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import os

import re

import nltk



#Load the data

os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')

review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')



#Clean the data

nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer



def clean_text(df_text_column, data):   

    corpus = 

    for i in range(0, len(data)):

        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])

        text = text.lower()

        text = text.split()

        ps = PorterStemmer()

        text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]

        text = ' '.join(text)

        corpus.append(text)

    return corpus



X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']

y = review['Liked']



# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)



# Creating the pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer() 

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

from sklearn.pipeline import make_pipeline

np.random.seed(0)

rf_pipe = make_pipeline(vect, rf)

rf_pipe.steps

rf_pipe.fit(X_train, y_train)



y_pred = rf_pipe.predict(X_test)

y_prob = rf_pipe.predict_proba(X_test)



#Performance Metrics

from sklearn import metrics

metrics.accuracy_score(y_test, y_pred) #Accuracy

metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score



# use Kernel SHAP to explain test set predictions

import shap

explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")

shap_values = explainer.shap_values(X_test, nsamples=100)



# plot the SHAP values

shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")

asked yesterday

Lacri Mosa

377

I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!

Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing

import numpy as np

import pandas as pd

import matplotlib.pyplot as plt

import os

import re

import nltk



#Load the data

os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')

review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')



#Clean the data

nltk.download('stopwords')

from nltk.corpus import stopwords

from nltk.stem.porter import PorterStemmer



def clean_text(df_text_column, data):   

    corpus = 

    for i in range(0, len(data)):

        text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])

        text = text.lower()

        text = text.split()

        ps = PorterStemmer()

        text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]

        text = ' '.join(text)

        corpus.append(text)

    return corpus



X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']

y = review['Liked']



# Splitting the dataset into the Training set and Test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)



# Creating the pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer() 

from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

from sklearn.pipeline import make_pipeline

np.random.seed(0)

rf_pipe = make_pipeline(vect, rf)

rf_pipe.steps

rf_pipe.fit(X_train, y_train)



y_pred = rf_pipe.predict(X_test)

y_prob = rf_pipe.predict_proba(X_test)



#Performance Metrics

from sklearn import metrics

metrics.accuracy_score(y_test, y_pred) #Accuracy

metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score



# use Kernel SHAP to explain test set predictions

import shap

explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")

shap_values = explainer.shap_values(X_test, nsamples=100)



# plot the SHAP values

shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")

python pipe pipeline

asked yesterday

Lacri Mosa

377

asked yesterday

Lacri Mosa

377

asked yesterday

Lacri Mosa

377

asked yesterday

Lacri Mosa

377

asked yesterday

Lacri Mosa

377

add a comment |

active

oldest

votes

Your Answer

StackExchange.ifUsing("editor", function () {
StackExchange.using("externalEditor", function () {
StackExchange.using("snippets", function () {
StackExchange.snippets.init();
});
});
}, "code-snippets");

StackExchange.ready(function() {
var channelOptions = {
tags: "".split(" "),
id: "1"
};
initTagRenderer("".split(" "), "".split(" "), channelOptions);

StackExchange.using("externalEditor", function() {
// Have to fire editor after snippets, if snippets enabled
if (StackExchange.settings.snippets.snippetsEnabled) {
StackExchange.using("snippets", function() {
createEditor();
});
}
else {
createEditor();
}
});

function createEditor() {
StackExchange.prepareEditor({
heartbeatType: 'answer',
convertImagesToLinks: true,
noModals: true,
showLowRepImageUploadWarning: true,
reputationToPostImages: 10,
bindNavPrevention: true,
postfix: "",
imageUploader: {
brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
allowUrls: true
},
onDemand: true,
discardSelector: ".discard-answer"
,immediatelyShowMarkdownHelp:true
});

}
});

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53183945%2fshap-kernelexplainer-error-on-textual-data-using-pipeline%23new-answer', 'question_page');
}
);

Post as a guest

Name

active

oldest

votes

draft saved

draft discarded

draft saved

draft discarded

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Post as a guest

Name

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Sign up or log in

StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});

Name

This page is only for reference, If you need detailed information, please check here

搜尋此網誌

Zystkmt