
Commit

Update caso_analitica_nlp_twitter.ipynb
Adds the imports, functions, and the solution to the first exercise
AndresSantamaria99 committed Jun 3, 2024
1 parent 9fcbbfd commit 7a2e114
Showing 1 changed file with 209 additions and 5 deletions.
214 changes: 209 additions & 5 deletions caso02/caso_analitica_nlp_twitter.ipynb
@@ -68,6 +68,18 @@
"source": [
"# Librerías\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import textstat\n",
"import seaborn as sns\n",
"from textblob import TextBlob\n",
"import re\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.stem import WordNetLemmatizer\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.cluster import KMeans\n",
"\n",
"\n",
"from caso02.descomprimir_dataset import unzip_dataset\n"
],
@@ -86,22 +98,177 @@
"tweets_df = pd.read_csv(csv_dataset_path)\n",
"tweets_df.describe()\n",
"\n",
"#Descarga de recursos necesarios de NLTK\n",
"nltk.download('punkt')\n",
"nltk.download('stopwords')\n",
"nltk.download('wordnet')\n",
"nltk.download('omw-1.4')\n",
"nltk.download('words')\n",
"# Procesamiento del dataframe\n"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"---\n",
"\n",
"## Pre-Procesamiento de los datos"
],
"id": "d70ef5b6ce59e327"
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Asignamos nombres a columnas del Dataset",
"id": "fbe0c7ef0075c9fb"
},
{
"cell_type": "code",
"id": "9de5b053e6aeaf8f",
"metadata": {},
"source": [
"# Otros Códigos\n",
"tweets_df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']\n",
"tweets_df.head()\n"
],
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": "#### Eliminamos la Columna 'flag' debido a que no aporta información relevante",
"id": "65b603e0f1dfdac1"
},
{
"metadata": {},
"cell_type": "code",
"source": "tweets_df = tweets_df.drop('flag', axis=1)",
"id": "eae4983ef8e6ccd4",
"outputs": [],
"execution_count": null
},
{
"metadata": {},
"cell_type": "markdown",
"source": [
"#### Realizamos una limpieza de los datos utilizando técnicas de conglomerado. Cada paso está numerado en su respectiva función, como se muestra a continuación:\n",
"1. Eliminar caracteres de puntuación, espacios adicionales, dígitos u otros caracteres que puedan entorpecer\n",
"el análisis textual\n",
"2. Tokenizar y eliminar Stopwords. Se requiere un diccionario de palabras para quitar aquellas que puedan\n",
"entorpecer el análisis textual. Por ejemplo, se puede utilizar “from nltk.corpus import stopwords”. Ejemplo:\n",
"NLTK stop words - Python Tutorial (pythonspot.com)\n",
"3. Encontrar la raíz de las palabras aplicando lemmatization o stemming.\n",
"4. Aplicar vectorizado del tokenizado para calcular apariciones de los tokens y cuantificar los tweets. Se\n",
"pueden usar distintos cálculos, por ejemplo Bag-of-Words, Word2Vec, o TFIDF con “from\n",
"sklearn.feature_extraction.text import TfidfVectorizer”\n",
"5. Aplicar clustering con técnicas adecuadas. Por ejemplo, Kmeans previo cálculo del número de clusters con\n",
"técnicas como Elbow."
],
"id": "2d23dbe9b7f72cc4"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"#1.\n",
"def limpieza_Datos(text):\n",
" text = text.lower()\n",
" text = re.sub(r'\\d+', '', text) \n",
" text = re.sub(r'\\s+', ' ', text)\n",
" text = re.sub(r'[^\\w\\s]', '', text)\n",
" text = text.strip() \n",
" return text\n",
"\n",
"#Aplicamos la funcion Limpieza de Datos en el Dataset\n",
"tweets_df['text'] = tweets_df['text'].apply(limpieza_Datos)\n",
"\n",
"#Printamos los Datos para visualizar la limpieza\n",
"tweets_df.head(100)"
],
"id": "bf55d8f1b2aa4018",
"outputs": [],
"execution_count": null
},
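{
"metadata": {},
"cell_type": "markdown",
"source": "A quick sanity check on the cleaning function. The cell below is an illustrative sketch: the raw tweet is a made-up example, used only to confirm that digits, punctuation, and extra whitespace are stripped.",
"id": "sketch-limpieza-check"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: run limpieza_Datos on a hypothetical raw tweet\n",
"sample_tweet = '@user OMG!!   I waited 45 minutes... #fail'\n",
"print(limpieza_Datos(sample_tweet))\n",
"# should print something like: 'user omg i waited minutes fail'"
],
"id": "sketch-limpieza-check-code",
"outputs": [],
"execution_count": null
},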
{
"metadata": {},
"cell_type": "code",
"source": [
"#2.\n",
"stop_words = set(stopwords.words('english'))\n",
"def tokenizar_y_eliminar_Stopwords(text):\n",
" tokens = word_tokenize(text)\n",
" tokens_filtrados = [word for word in tokens if word not in stop_words]\n",
" return tokens_filtrados\n",
"\n",
"resultado = df_Twitter['Busqueda'].unique()\n",
" \n",
"print(resultado)"
"#Aplicamos la función sobre el dataset\n",
"tweets_df['tokens'] = tweets_df['text'].apply(tokenizar_y_eliminar_Stopwords)"
],
"id": "d649c2554b461d10",
"outputs": [],
"execution_count": null
},
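{
"metadata": {},
"cell_type": "markdown",
"source": "Another small check, this time on the tokenizer. The sentence below is hypothetical; the exact output depends on the NLTK English stopword list, but common function words such as 'this', 'is', and 'a' should be dropped.",
"id": "sketch-stopwords-check"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: tokenize a hypothetical sentence and drop stopwords\n",
"print(tokenizar_y_eliminar_Stopwords('this is just an example of a very noisy tweet'))\n",
"# expected output along the lines of: ['example', 'noisy', 'tweet']"
],
"id": "sketch-stopwords-check-code",
"outputs": [],
"execution_count": null
},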
{
"metadata": {},
"cell_type": "code",
"source": [
"#3.\n",
"lemmatizer = WordNetLemmatizer()\n",
"\n",
"def lemmatize_tokens(tokens):\n",
" lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]\n",
" return lemmatized_tokens\n",
"\n",
"#Aplicamos la función sobre el dataset\n",
"tweets_df['lemmatized_tokens'] = tweets_df['tokens'].apply(lemmatize_tokens)"
],
"id": "bcf40ecaf9425934",
"outputs": [],
"execution_count": null
},
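{
"metadata": {},
"cell_type": "markdown",
"source": "One caveat worth illustrating: WordNetLemmatizer.lemmatize assumes the noun part of speech by default, so verb forms pass through unchanged unless a POS tag is supplied. The sample words below are hypothetical.",
"id": "sketch-lemmatizer-pos"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: the lemmatizer defaults to pos='n' (noun)\n",
"print(lemmatizer.lemmatize('cars'))              # -> 'car'\n",
"print(lemmatizer.lemmatize('running'))           # -> 'running' (treated as a noun)\n",
"print(lemmatizer.lemmatize('running', pos='v'))  # -> 'run'"
],
"id": "sketch-lemmatizer-pos-code",
"outputs": [],
"execution_count": null
},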
{
"metadata": {},
"cell_type": "code",
"source": [
"#4.\n",
"tweets_df['processed_text'] = tweets_df['lemmatized_tokens'].apply(lambda tokens: ' '.join(tokens))\n",
"vectorizer = TfidfVectorizer(max_features=1000)\n",
"X = vectorizer.fit_transform(tweets_df['processed_text'])"
],
"id": "f9b95f76be1e7734",
"outputs": [],
"execution_count": null
},
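{
"metadata": {},
"cell_type": "markdown",
"source": "To see what the vectorizer learned, the sketch below inspects the matrix shape and part of the vocabulary. It assumes scikit-learn >= 1.0, where the method is called get_feature_names_out; older versions expose get_feature_names instead.",
"id": "sketch-tfidf-inspect"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: inspect the TF-IDF matrix and vocabulary\n",
"print(X.shape)  # (number of tweets, 1000) given max_features=1000\n",
"print(vectorizer.get_feature_names_out()[:20])  # first 20 terms, alphabetical order"
],
"id": "sketch-tfidf-inspect-code",
"outputs": [],
"execution_count": null
},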
{
"metadata": {},
"cell_type": "code",
"source": [
"#5.\n",
"def metodo_elbow(X):\n",
" wcss = []\n",
" for i in range(1, 11):\n",
" kmeans = KMeans(n_clusters=i, max_iter=300, n_init=10, random_state=0)\n",
" kmeans.fit(X)\n",
" wcss.append(kmeans.inertia_)\n",
" plt.plot(range(1, 11), wcss)\n",
" plt.title('Método del codo')\n",
" plt.xlabel('Número de clusters')\n",
" plt.ylabel('WCSS')\n",
" plt.show()\n",
"\n",
"metodo_elbow(X)\n",
"\n",
"kmeans = KMeans(n_clusters=3, max_iter=300, n_init=10, random_state=0)\n",
"clusters = kmeans.fit_predict(X)\n",
"\n",
"tweets_df['cluster'] = clusters\n",
"\n",
"tweets_df.head()\n"
],
"id": "48c78b7461b78f9d",
"outputs": [],
"execution_count": null
},
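{
"metadata": {},
"cell_type": "markdown",
"source": "A common way to interpret the clusters is to look at the highest-weighted TF-IDF terms in each centroid. The sketch below assumes the vectorizer and the fitted kmeans from the cells above.",
"id": "sketch-cluster-terms"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: top TF-IDF terms per cluster centroid\n",
"terms = vectorizer.get_feature_names_out()\n",
"order = kmeans.cluster_centers_.argsort()[:, ::-1]  # term indices by descending weight\n",
"for i in range(kmeans.n_clusters):\n",
"    top_terms = [terms[j] for j in order[i, :10]]\n",
"    print(f'Cluster {i}: ' + ', '.join(top_terms))"
],
"id": "sketch-cluster-terms-code",
"outputs": [],
"execution_count": null
},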
@@ -132,9 +299,46 @@
"id": "12ef9e4b76081904",
"metadata": {},
"source": [
"# Respuesta\n",
"print(\"Respuestas\")"
"#1.a\n",
"\n",
"def calcular_polarity(text):\n",
" blob = TextBlob(text)\n",
" return blob.sentiment.polarity * 5\n",
"\n",
"tweets_df['polarity'] = tweets_df['text'].apply(calcular_polarity)\n",
"\n",
"polarity_counts = tweets_df['polarity'].value_counts(bins=10, sort=False)\n",
"print(polarity_counts)\n",
"\n",
"polarity_counts.plot(kind='bar', title='Distribución de las Polaridades de los Tweets')\n",
"plt.xlabel('Polaridad')\n",
"plt.ylabel('Cantidad de Tweets')\n",
"plt.show()\n"
],
"outputs": [],
"execution_count": null
},
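{
"metadata": {},
"cell_type": "markdown",
"source": "For reference, TextBlob's sentiment.polarity lies in [-1, 1], so the scaling in calcular_polarity maps it to [-5, 5]. The sentences below are made-up examples.",
"id": "sketch-polarity-range"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: scaled polarity of a few hypothetical sentences\n",
"for s in ['I love this!', 'This is terrible.', 'The sky is blue.']:\n",
"    print(s, '->', round(calcular_polarity(s), 2))"
],
"id": "sketch-polarity-range-code",
"outputs": [],
"execution_count": null
},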
{
"metadata": {},
"cell_type": "code",
"source": [
"#1.b\n",
"\n",
"def calcular_readability(text):\n",
" return textstat.flesch_kincaid_grade(text)\n",
"\n",
"tweets_df['readability'] = tweets_df['text'].apply(calcular_readability)\n",
"\n",
"plt.figure(figsize=(10, 6))\n",
"sns.boxplot(x=pd.cut(tweets_df['polarity'], bins=10), y='readability', data=tweets_df)\n",
"plt.title('Relación entre Polaridad y Complejidad de Lectura de los Tweets')\n",
"plt.xlabel('Polaridad')\n",
"plt.ylabel('Complejidad de Lectura (Flesch-Kincaid Grade Level)')\n",
"plt.show()\n",
"\n",
"print(tweets_df.head(100))\n"
],
"id": "15df0463515bc6aa",
"outputs": [],
"execution_count": null
},
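{
"metadata": {},
"cell_type": "markdown",
"source": "The boxplot gives a visual impression of the relationship; a correlation coefficient is a simple complementary check, computed from the columns created above.",
"id": "sketch-polarity-readability-corr"
},
{
"metadata": {},
"cell_type": "code",
"source": [
"# Illustrative sketch: quantify the polarity/readability relationship\n",
"print(tweets_df[['polarity', 'readability']].corr(method='pearson'))"
],
"id": "sketch-polarity-readability-corr-code",
"outputs": [],
"execution_count": null
},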
