SHAP KernelExplainer error on textual data using pipeline

Multi tool use
Multi tool use











up vote
0
down vote

favorite












I was looking through the SHAP package for Python and I found no examples using KernelExplainer to explain textual data predictions so I decided to test it out using the dataset i found on https://www.superdatascience.com/machine-learning/.



I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.



AttributeError: 'numpy.ndarray' object has no attribute 'lower'


Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!



Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing



import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import re
import nltk

#Load the data
os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')

#Clean the data
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def clean_text(df_text_column, data):
corpus =
for i in range(0, len(data)):
text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])
text = text.lower()
text = text.split()
ps = PorterStemmer()
text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
text = ' '.join(text)
corpus.append(text)
return corpus

X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
y = review['Liked']

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

# Creating the pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
from sklearn.pipeline import make_pipeline
np.random.seed(0)
rf_pipe = make_pipeline(vect, rf)
rf_pipe.steps
rf_pipe.fit(X_train, y_train)

y_pred = rf_pipe.predict(X_test)
y_prob = rf_pipe.predict_proba(X_test)

#Performance Metrics
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred) #Accuracy
metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score

# use Kernel SHAP to explain test set predictions
import shap
explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
shap_values = explainer.shap_values(X_test, nsamples=100)

# plot the SHAP values
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")









share|improve this question


























    up vote
    0
    down vote

    favorite












    I was looking through the SHAP package for Python and I found no examples using KernelExplainer to explain textual data predictions so I decided to test it out using the dataset i found on https://www.superdatascience.com/machine-learning/.



    I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.



    AttributeError: 'numpy.ndarray' object has no attribute 'lower'


    Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!



    Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing



    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import os
    import re
    import nltk

    #Load the data
    os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
    review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')

    #Clean the data
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer

    def clean_text(df_text_column, data):
    corpus =
    for i in range(0, len(data)):
    text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
    text = ' '.join(text)
    corpus.append(text)
    return corpus

    X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
    y = review['Liked']

    # Splitting the dataset into the Training set and Test set
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

    # Creating the pipeline
    from sklearn.feature_extraction.text import TfidfVectorizer
    vect = TfidfVectorizer()
    from sklearn.ensemble import RandomForestClassifier
    rf = RandomForestClassifier()
    from sklearn.pipeline import make_pipeline
    np.random.seed(0)
    rf_pipe = make_pipeline(vect, rf)
    rf_pipe.steps
    rf_pipe.fit(X_train, y_train)

    y_pred = rf_pipe.predict(X_test)
    y_prob = rf_pipe.predict_proba(X_test)

    #Performance Metrics
    from sklearn import metrics
    metrics.accuracy_score(y_test, y_pred) #Accuracy
    metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score

    # use Kernel SHAP to explain test set predictions
    import shap
    explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
    shap_values = explainer.shap_values(X_test, nsamples=100)

    # plot the SHAP values
    shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")









    share|improve this question
























      up vote
      0
      down vote

      favorite









      up vote
      0
      down vote

      favorite











      I was looking through the SHAP package for Python and I found no examples using KernelExplainer to explain textual data predictions so I decided to test it out using the dataset i found on https://www.superdatascience.com/machine-learning/.



      I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.



      AttributeError: 'numpy.ndarray' object has no attribute 'lower'


      Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!



      Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing



      import numpy as np
      import pandas as pd
      import matplotlib.pyplot as plt
      import os
      import re
      import nltk

      #Load the data
      os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
      review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')

      #Clean the data
      nltk.download('stopwords')
      from nltk.corpus import stopwords
      from nltk.stem.porter import PorterStemmer

      def clean_text(df_text_column, data):
      corpus =
      for i in range(0, len(data)):
      text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])
      text = text.lower()
      text = text.split()
      ps = PorterStemmer()
      text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
      text = ' '.join(text)
      corpus.append(text)
      return corpus

      X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
      y = review['Liked']

      # Splitting the dataset into the Training set and Test set
      from sklearn.model_selection import train_test_split
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

      # Creating the pipeline
      from sklearn.feature_extraction.text import TfidfVectorizer
      vect = TfidfVectorizer()
      from sklearn.ensemble import RandomForestClassifier
      rf = RandomForestClassifier()
      from sklearn.pipeline import make_pipeline
      np.random.seed(0)
      rf_pipe = make_pipeline(vect, rf)
      rf_pipe.steps
      rf_pipe.fit(X_train, y_train)

      y_pred = rf_pipe.predict(X_test)
      y_prob = rf_pipe.predict_proba(X_test)

      #Performance Metrics
      from sklearn import metrics
      metrics.accuracy_score(y_test, y_pred) #Accuracy
      metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score

      # use Kernel SHAP to explain test set predictions
      import shap
      explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
      shap_values = explainer.shap_values(X_test, nsamples=100)

      # plot the SHAP values
      shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")









      share|improve this question













      I was looking through the SHAP package for Python and I found no examples using KernelExplainer to explain textual data predictions so I decided to test it out using the dataset i found on https://www.superdatascience.com/machine-learning/.



      I encountered a problem in the KernelExplainer part at the last bit, where I believe the problem is the way I input the data and model into the explainer.



      AttributeError: 'numpy.ndarray' object has no attribute 'lower'


      Can anyone advise me on what I should revise so as to make the explainer work? I spent hours on this last bit but to no avail. Any help or advice is greatly appreciated. With much thanks!



      Dataset: https://drive.google.com/file/d/1-pzY7IQVyB_GmT5dT0yRx3hYzOFGrZSr/view?usp=sharing



      import numpy as np
      import pandas as pd
      import matplotlib.pyplot as plt
      import os
      import re
      import nltk

      #Load the data
      os.chdir('C:\Users\Win\Desktop\MyLearning\Explainability\SHAP')
      review = pd.read_csv('Restaurant_Reviews.tsv', sep='t')

      #Clean the data
      nltk.download('stopwords')
      from nltk.corpus import stopwords
      from nltk.stem.porter import PorterStemmer

      def clean_text(df_text_column, data):
      corpus =
      for i in range(0, len(data)):
      text = re.sub('[^a-zA-Z]', ' ', df_text_column[i])
      text = text.lower()
      text = text.split()
      ps = PorterStemmer()
      text = [ps.stem(word) for word in text if not word in set(stopwords.words('english'))]
      text = ' '.join(text)
      corpus.append(text)
      return corpus

      X = pd.DataFrame({'Review':clean_text(review['Review'],review)})['Review']
      y = review['Liked']

      # Splitting the dataset into the Training set and Test set
      from sklearn.model_selection import train_test_split
      X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)

      # Creating the pipeline
      from sklearn.feature_extraction.text import TfidfVectorizer
      vect = TfidfVectorizer()
      from sklearn.ensemble import RandomForestClassifier
      rf = RandomForestClassifier()
      from sklearn.pipeline import make_pipeline
      np.random.seed(0)
      rf_pipe = make_pipeline(vect, rf)
      rf_pipe.steps
      rf_pipe.fit(X_train, y_train)

      y_pred = rf_pipe.predict(X_test)
      y_prob = rf_pipe.predict_proba(X_test)

      #Performance Metrics
      from sklearn import metrics
      metrics.accuracy_score(y_test, y_pred) #Accuracy
      metrics.roc_auc_score(y_test, y_prob[:, 1]) #ROC-AUC score

      # use Kernel SHAP to explain test set predictions
      import shap
      explainer = shap.KernelExplainer(rf_pipe.predict_proba, X_train, link="logit")
      shap_values = explainer.shap_values(X_test, nsamples=100)

      # plot the SHAP values
      shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], X_test.iloc[0,:], link="logit")






      python pipe pipeline






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked yesterday









      Lacri Mosa

      377




      377





























          active

          oldest

          votes











          Your Answer






          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "1"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          convertImagesToLinks: true,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: 10,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














           

          draft saved


          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53183945%2fshap-kernelexplainer-error-on-textual-data-using-pipeline%23new-answer', 'question_page');
          }
          );

          Post as a guest





































          active

          oldest

          votes













          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















           

          draft saved


          draft discarded



















































           


          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fstackoverflow.com%2fquestions%2f53183945%2fshap-kernelexplainer-error-on-textual-data-using-pipeline%23new-answer', 'question_page');
          }
          );

          Post as a guest




















































































          nz5,ZdBCQ
          YQCaP,rpPvR1t7Jr

          Popular posts from this blog

          横浜市

          Rostock

          Europa