From 7a2e114158c1486dc57e39be02822a559b4fa6ae Mon Sep 17 00:00:00 2001
From: AndresSantamaria99 <145497969+AndresSantamaria99@users.noreply.github.com>
Date: Mon, 3 Jun 2024 02:22:42 +0200
Subject: [PATCH] Update caso_analitica_nlp_twitter.ipynb
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the imports, the helper functions, and the solution to the first question.
---
 caso02/caso_analitica_nlp_twitter.ipynb | 214 +++++++++++++++++++++++-
 1 file changed, 209 insertions(+), 5 deletions(-)

diff --git a/caso02/caso_analitica_nlp_twitter.ipynb b/caso02/caso_analitica_nlp_twitter.ipynb
index 8fff05f..9ea0225 100644
--- a/caso02/caso_analitica_nlp_twitter.ipynb
+++ b/caso02/caso_analitica_nlp_twitter.ipynb
@@ -68,6 +68,18 @@
    "source": [
     "# Libraries\n",
     "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import textstat\n",
+    "import seaborn as sns\n",
+    "from textblob import TextBlob\n",
+    "import re\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.tokenize import word_tokenize\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.cluster import KMeans\n",
+    "\n",
     "\n",
     "from caso02.descomprimir_dataset import unzip_dataset\n"
    ],
@@ -86,22 +98,177 @@
    "tweets_df = pd.read_csv(csv_dataset_path)\n",
     "tweets_df.describe()\n",
     "\n",
+    "# Download the NLTK resources used below\n",
+    "nltk.download('punkt')\n",
+    "nltk.download('stopwords')\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('omw-1.4')\n",
+    "nltk.download('words')\n",
     "# Dataframe processing\n"
    ],
    "outputs": [],
    "execution_count": null
   },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "---\n",
+    "\n",
+    "## Data pre-processing"
+   ],
+   "id": "d70ef5b6ce59e327"
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "#### Assign names to the dataset columns",
+   "id": "fbe0c7ef0075c9fb"
+  },
  {
   "cell_type": "code",
   "id": "9de5b053e6aeaf8f",
   "metadata": {},
   "source": [
    "# Other code\n",
+    "tweets_df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']\n",
+    "tweets_df.head()\n"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": "#### Drop the 'flag' column, since it does not provide relevant information",
+   "id": "65b603e0f1dfdac1"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": "tweets_df = tweets_df.drop('flag', axis=1)",
+   "id": "eae4983ef8e6ccd4",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "markdown",
+   "source": [
+    "#### We clean the data and prepare it for clustering. Each step is numbered in its corresponding cell, as shown below:\n",
+    "1. Remove punctuation, extra whitespace, digits, and any other characters that could hinder\n",
+    "the text analysis.\n",
+    "2. Tokenize and remove stopwords. A word list is needed to drop the words that could hinder\n",
+    "the text analysis; for example, “from nltk.corpus import stopwords” can be used. Example:\n",
+    "NLTK stop words - Python Tutorial (pythonspot.com)\n",
+    "3. Find the root of each word by applying lemmatization or stemming.\n",
+    "4. Vectorize the tokenized text to count token occurrences and quantify the tweets. Different\n",
+    "representations can be used, for example Bag-of-Words, Word2Vec, or TF-IDF with “from\n",
+    "sklearn.feature_extraction.text import TfidfVectorizer”.\n",
+    "5. Apply clustering with a suitable technique, for example KMeans, after estimating the number of\n",
+    "clusters with a method such as the elbow."
+   ],
+   "id": "2d23dbe9b7f72cc4"
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# 1. Remove digits, extra whitespace, and punctuation\n",
+    "def limpieza_Datos(text):\n",
+    "    text = text.lower()\n",
+    "    text = re.sub(r'\\d+', '', text)\n",
+    "    text = re.sub(r'\\s+', ' ', text)\n",
+    "    text = re.sub(r'[^\\w\\s]', '', text)\n",
+    "    text = text.strip()\n",
+    "    return text\n",
+    "\n",
+    "# Apply the cleaning function to the dataset\n",
+    "tweets_df['text'] = tweets_df['text'].apply(limpieza_Datos)\n",
+    "\n",
+    "# Show the first rows to check the cleaning\n",
+    "tweets_df.head(100)"
+   ],
+   "id": "bf55d8f1b2aa4018",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# 2. Tokenize and remove stopwords\n",
+    "stop_words = set(stopwords.words('english'))\n",
+    "def tokenizar_y_eliminar_Stopwords(text):\n",
+    "    tokens = word_tokenize(text)\n",
+    "    tokens_filtrados = [word for word in tokens if word not in stop_words]\n",
+    "    return tokens_filtrados\n",
     "\n",
-    "resultado = df_Twitter['Busqueda'].unique()\n",
-    " \n",
-    "print(resultado)"
+    "# Apply the function to the dataset\n",
+    "tweets_df['tokens'] = tweets_df['text'].apply(tokenizar_y_eliminar_Stopwords)"
+   ],
+   "id": "d649c2554b461d10",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# 3. Reduce each token to its root form with lemmatization\n",
+    "lemmatizer = WordNetLemmatizer()\n",
+    "\n",
+    "def lemmatize_tokens(tokens):\n",
+    "    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n",
+    "    return lemmatized_tokens\n",
+    "\n",
+    "# Apply the function to the dataset\n",
+    "tweets_df['lemmatized_tokens'] = tweets_df['tokens'].apply(lemmatize_tokens)"
+   ],
+   "id": "bcf40ecaf9425934",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# 4. TF-IDF vectorization of the processed text\n",
+    "tweets_df['processed_text'] = tweets_df['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))\n",
+    "vectorizer = TfidfVectorizer(max_features=1000)\n",
+    "X = vectorizer.fit_transform(tweets_df['processed_text'])"
+   ],
+   "id": "f9b95f76be1e7734",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# 5. Elbow method to choose the number of clusters, then KMeans\n",
+    "def metodo_elbow(X):\n",
+    "    wcss = []\n",
+    "    for i in range(1, 11):\n",
+    "        kmeans = KMeans(n_clusters=i, max_iter=300, n_init=10, random_state=0)\n",
+    "        kmeans.fit(X)\n",
+    "        wcss.append(kmeans.inertia_)\n",
+    "    plt.plot(range(1, 11), wcss)\n",
+    "    plt.title('Elbow method')\n",
+    "    plt.xlabel('Number of clusters')\n",
+    "    plt.ylabel('WCSS')\n",
+    "    plt.show()\n",
+    "\n",
+    "metodo_elbow(X)\n",
+    "\n",
+    "kmeans = KMeans(n_clusters=3, max_iter=300, n_init=10, random_state=0)  # k chosen from the elbow plot\n",
+    "clusters = kmeans.fit_predict(X)\n",
+    "\n",
+    "tweets_df['cluster'] = clusters\n",
+    "\n",
+    "tweets_df.head()\n"
    ],
+   "id": "48c78b7461b78f9d",
    "outputs": [],
    "execution_count": null
   },
@@ -132,9 +299,46 @@
    "id": "12ef9e4b76081904",
    "metadata": {},
    "source": [
-    "# Respuesta\n",
-    "print(\"Respuestas\")"
+    "# 1.a Polarity of each tweet\n",
+    "\n",
+    "def calcular_polarity(text):\n",
+    "    blob = TextBlob(text)\n",
+    "    return blob.sentiment.polarity * 5  # scale TextBlob's [-1, 1] polarity to [-5, 5]\n",
+    "\n",
+    "tweets_df['polarity'] = tweets_df['text'].apply(calcular_polarity)\n",
+    "\n",
+    "polarity_counts = tweets_df['polarity'].value_counts(bins=10, sort=False)\n",
+    "print(polarity_counts)\n",
+    "\n",
+    "polarity_counts.plot(kind='bar', title='Distribution of tweet polarities')\n",
+    "plt.xlabel('Polarity')\n",
+    "plt.ylabel('Number of tweets')\n",
+    "plt.show()\n"
    ],
    "outputs": [],
    "execution_count": null
   },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "source": [
+    "# 1.b Relationship between polarity and readability\n",
+    "\n",
+    "def calcular_readability(text):\n",
+    "    return textstat.flesch_kincaid_grade(text)\n",
+    "\n",
+    "tweets_df['readability'] = tweets_df['text'].apply(calcular_readability)\n",
+    "\n",
+    "plt.figure(figsize=(10, 6))\n",
+    "sns.boxplot(x=pd.cut(tweets_df['polarity'], bins=10), y='readability', data=tweets_df)\n",
+    "plt.title('Relationship between tweet polarity and readability')\n",
+    "plt.xlabel('Polarity')\n",
+    "plt.ylabel('Readability (Flesch-Kincaid grade level)')\n",
+    "plt.show()\n",
+    "\n",
+    "print(tweets_df.head(100))\n"
+   ],
+   "id": "15df0463515bc6aa",
+   "outputs": [],
+   "execution_count": null
+  },