+ {
+  "metadata": {
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.8.2-final"
+   },
+   "orig_nbformat": 2,
+   "kernelspec": {
+    "name": "python3",
+    "display_name": "Python 3.8.2 64-bit",
+    "metadata": {
+     "interpreter": {
+      "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
+     }
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 2,
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user --upgrade pip\n",
+     "!pip3 install --user nltk"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import nltk\n",
+     "nltk.download('wordnet')\n",
+     "nltk.download('punkt')\n",
+     "nltk.download('averaged_perceptron_tagger')\n",
+     "nltk.download('gutenberg')\n",
+     "nltk.download('stopwords')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {
+     "tags": []
+    },
+    "outputs": [],
+    "source": [
+     "from nltk.tokenize import word_tokenize\n",
+     "from nltk.tokenize import sent_tokenize\n",
+     "\n",
+     "corpus = '''Tokenization is the process of tokenizing or splitting a string, text into a list of tokens. One can think of token as parts like a word is a token in a sentence, and a sentence is a token in a paragraph.'''\n",
+     "\n",
+     "print(sent_tokenize(corpus))\n",
+     "print(word_tokenize(corpus))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.stem import WordNetLemmatizer\n",
+     "\n",
+     "corpus = ['rocks', 'gone', 'better']\n",
+     "lemmatizer = WordNetLemmatizer()\n",
+     "\n",
+     "print([lemmatizer.lemmatize(w) for w in corpus])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk import pos_tag\n",
+     "\n",
+     "def lemmatize_sent(text):\n",
+     "    # Map Penn Treebank tag prefixes to WordNet POS codes; default to noun\n",
+     "    pos_dict = {'NN':'n', 'JJ':'a', 'VB':'v', 'RB':'r'}\n",
+     "    word_list = []\n",
+     "    for word, tag in pos_tag(word_tokenize(text)):\n",
+     "        pos = pos_dict[tag[0:2]] if tag[0:2] in pos_dict else 'n'\n",
+     "        word_list.append(lemmatizer.lemmatize(word, pos=pos))\n",
+     "    return word_list\n",
+     "\n",
+     "sentence = 'He is walking to school'\n",
+     "print('lemmatize word by word: ', [lemmatizer.lemmatize(w) for w in word_tokenize(sentence)])\n",
+     "print('lemmatize with context: ', lemmatize_sent(sentence))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.stem import PorterStemmer\n",
+     "\n",
+     "corpus = ['rocks', 'going', 'history']\n",
+     "stemmer = PorterStemmer()\n",
+     "print([stemmer.stem(w) for w in corpus])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.corpus import gutenberg\n",
+     "import time\n",
+     "\n",
+     "def timing(func):\n",
+     "    # Decorator that reports how long the wrapped function takes, in milliseconds\n",
+     "    def decorate(*args, **kwargs):\n",
+     "        start = time.time()\n",
+     "        func(*args, **kwargs)\n",
+     "        print(\"%-30s: %-7.2f ms\" % (func.__name__, (time.time() - start) * 1000))\n",
+     "    return decorate\n",
+     "\n",
+     "@timing\n",
+     "def stemming(text):\n",
+     "    [stemmer.stem(w) for w in word_tokenize(text)]\n",
+     "\n",
+     "@timing\n",
+     "def lemmatize(text):\n",
+     "    lemmatize_sent(text)\n",
+     "\n",
+     "@timing\n",
+     "def lemmatize_without_context(text):\n",
+     "    [lemmatizer.lemmatize(w) for w in word_tokenize(text)]\n",
+     "\n",
+     "book = gutenberg.raw(\"austen-sense.txt\")\n",
+     "\n",
+     "stemming(book)\n",
+     "lemmatize(book)\n",
+     "lemmatize_without_context(book)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from nltk.corpus import stopwords\n",
+     "\n",
+     "# NLTK's stopword list is lowercase, so compare case-insensitively\n",
+     "corpus = ['I', 'am', 'a', 'boy']\n",
+     "print([w for w in corpus if w.lower() not in set(stopwords.words('english'))])"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user scikit-learn"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from sklearn.feature_extraction.text import CountVectorizer\n",
+     "\n",
+     "vectorizer = CountVectorizer()\n",
+     "corpus = [\n",
+     "    'He is a teacher',\n",
+     "    'I am student',\n",
+     "    'She is also a student',\n",
+     "]\n",
+     "X = vectorizer.fit_transform(corpus)\n",
+     "\n",
+     "print(vectorizer.get_feature_names())\n",
+     "print(list(vectorizer.stop_words_))\n",
+     "print(X.toarray())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+     "\n",
+     "vectorizer = TfidfVectorizer()\n",
+     "corpus = [\n",
+     "    'He is a teacher',\n",
+     "    'I am student',\n",
+     "    'She is also a student',\n",
+     "]\n",
+     "X = vectorizer.fit_transform(corpus)\n",
+     "\n",
+     "print(vectorizer.get_feature_names())\n",
+     "print(X.toarray())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user numpy"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import numpy as np\n",
+     "\n",
+     "# Manual TF-IDF calculation for the sentence 'He is a teacher'.\n",
+     "# sklearn's smoothed idf is ln((1 + n_docs) / (1 + df)) + 1; the vector is then L2-normalized.\n",
+     "tfidf_he = 1/3 * (np.log((3+1)/(1+1)) + 1)\n",
+     "tfidf_is = 1/3 * (np.log((3+1)/(2+1)) + 1)\n",
+     "tfidf_teacher = 1/3 * (np.log((3+1)/(1+1)) + 1)\n",
+     "\n",
+     "print(np.divide([tfidf_he, tfidf_is, tfidf_teacher], np.sqrt(tfidf_he*tfidf_he + tfidf_is*tfidf_is + tfidf_teacher*tfidf_teacher)))"
+    ]
+   },
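+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# A quick cross-check sketch (assuming `vectorizer` and `X` still hold the\n",
+     "# TfidfVectorizer and its matrix from the earlier cell): the non-zero entries of\n",
+     "# the row for 'He is a teacher' should match the manual values computed above.\n",
+     "print(vectorizer.get_feature_names())\n",
+     "print(X.toarray()[0])"
+    ]
+   },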
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install --user gensim"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import gensim.downloader\n",
+     "from gensim.models import Word2Vec\n",
+     "\n",
+     "# Download and load the pretrained Google News word2vec vectors (large download)\n",
+     "word2vec = gensim.downloader.load('word2vec-google-news-300')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "print(word2vec.most_similar('car'))\n",
+     "print(word2vec.word_vec('car'))"
+    ]
+   },
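+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# A brief illustrative sketch (assuming the vectors loaded above): cosine similarity\n",
+     "# between two specific words via KeyedVectors.similarity; the word pairs are arbitrary examples.\n",
+     "print(word2vec.similarity('car', 'truck'))\n",
+     "print(word2vec.similarity('car', 'banana'))"
+    ]
+   },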
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "!pip3 install tensorflow"
+    ]
+   }
+  ]
+ }