Commit 13ae1c62 by Yolanda Nainggolan

adding preprocessing page and editing dataframe page

parent 804be444
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Kelompok 3 | Search Engine with Inverted Index Simulator Based on Billboard Songs Collection\n",
" - 12S16003 Maria H. Siallagan\n",
" - 12S16026 Yolanda Nainggolan\n",
" - 12S16036 Prima Hutapea\n",
" - 12S16049 Rosa Delima Mendrofa"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'dataset_STBI.xml'",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[1;32m<ipython-input-1-0410f424fcaa>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0msklearn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mfeature_extraction\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mCountVectorizer\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 4\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mminidom\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mminidom\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 5\u001b[1;33m \u001b[0mdcmnt_xml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mminidom\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"dataset_STBI.xml\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[1;32m~\\Anaconda3\\lib\\xml\\dom\\minidom.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(file, parser, bufsize)\u001b[0m\n\u001b[0;32m 1956\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mparser\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mbufsize\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1957\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdom\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mexpatbuilder\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1958\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mexpatbuilder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparse\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1959\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1960\u001b[0m \u001b[1;32mfrom\u001b[0m \u001b[0mxml\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdom\u001b[0m \u001b[1;32mimport\u001b[0m \u001b[0mpulldom\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;32m~\\Anaconda3\\lib\\xml\\dom\\expatbuilder.py\u001b[0m in \u001b[0;36mparse\u001b[1;34m(file, namespaces)\u001b[0m\n\u001b[0;32m 908\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 909\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 910\u001b[1;33m \u001b[1;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'rb'\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0mfp\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 911\u001b[0m \u001b[0mresult\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mbuilder\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mparseFile\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfp\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 912\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
"\u001b[1;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'dataset_STBI.xml'"
]
}
],
"source": [
"import string\n",
"import re\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"import xml.dom.minidom as minidom\n",
"dcmnt_xml = minidom.parse(\"dataset_STBI.xml\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Element lists per XML tag. Later cells index all_doc_no and all_text with the\n",
"# same position, i.e. they assume one DOCNO and one LYRICS element per document.\n",
"all_doc_no = dcmnt_xml.getElementsByTagName('DOCNO')\n",
"all_profile = dcmnt_xml.getElementsByTagName('SONG')\n",
"all_date = dcmnt_xml.getElementsByTagName('ARTIST')\n",
"all_text = dcmnt_xml.getElementsByTagName('LYRICS')\n",
"all_pub = dcmnt_xml.getElementsByTagName('PUB')\n",
"all_page = dcmnt_xml.getElementsByTagName('PAGE')\n",
"\n",
"N_DOC_sample = len(all_doc_no)\n",
"# The preprocessing and indexing cells below loop over N_DOC, which was never\n",
"# assigned; alias it here so a fresh Restart & Run All does not raise NameError.\n",
"N_DOC = N_DOC_sample"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Raw lyrics text of every document, each prefixed with a single space.\n",
"all_sentence_doc_sample = []\n",
"for i in range(N_DOC_sample):\n",
"    lyrics_node = all_text[i].firstChild\n",
"    # An empty <LYRICS></LYRICS> element has no child node; treat it as empty text\n",
"    # instead of failing on None.data.\n",
"    lyrics = lyrics_node.data if lyrics_node is not None else ''\n",
"    all_sentence_doc_sample.append(' ' + lyrics)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Preprocessing "
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"tokens_doc = []"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def remove_punc_tokenize(sentence):\n",
"    \"\"\"Strip URL lines and punctuation from ``sentence`` and return its word tokens.\n",
"\n",
"    Lines that start with an http(s) URL are removed first. This must happen\n",
"    before punctuation removal: once ':' and '/' have been replaced by spaces\n",
"    the URL pattern can no longer match anything.\n",
"\n",
"    Every character in ``string.punctuation`` is then replaced by a space and the\n",
"    result is split with CountVectorizer's default tokenizer.\n",
"\n",
"    Returns a list of token strings.\n",
"    \"\"\"\n",
"    sentence = re.sub(r'^https?:\\/\\/.*[\\r\\n]*', '', sentence, flags=re.MULTILINE)\n",
"\n",
"    # One translate() call replaces the per-character replace() loop.\n",
"    punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))\n",
"    sentence = sentence.translate(punctuation_to_space)\n",
"\n",
"    tokenize = CountVectorizer().build_tokenizer()\n",
"    return list(tokenize(sentence))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"# Tokenize every document. Rebuilding the list (instead of appending to it)\n",
"# keeps this cell idempotent on re-run, and iterating the documents directly\n",
"# avoids the N_DOC counter, which no earlier cell assigns.\n",
"tokens_doc = [remove_punc_tokenize(doc) for doc in all_sentence_doc_sample]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"from nltk.corpus import stopwords\n",
"stop_words = set(stopwords.words('english'))\n",
"def stop_word_token(tokens):\n",
"    \"\"\"Return a new list with the tokens that are not in ``stop_words``.\n",
"\n",
"    Membership is an exact (case-sensitive) match against NLTK's English list.\n",
"    \"\"\"\n",
"    return [w for w in tokens if w not in stop_words]\n",
"\n",
"# Filter every document; iterating tokens_doc itself avoids the N_DOC counter,\n",
"# which no earlier cell assigns.\n",
"tokens_doc = [stop_word_token(doc_tokens) for doc_tokens in tokens_doc]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Drop every token that contains at least one digit. Iterating tokens_doc\n",
"# directly avoids the N_DOC counter, which no earlier cell assigns.\n",
"tokens_doc = [\n",
"    [w for w in doc_tokens if not any(ch.isdigit() for ch in w)]\n",
"    for doc_tokens in tokens_doc\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from nltk.stem import PorterStemmer\n",
"stemmer = PorterStemmer()\n",
"def stemming(tokens):\n",
"    \"\"\"Replace each entry of ``tokens`` with its Porter stem, in place.\n",
"\n",
"    Returns the same list object that was passed in.\n",
"    \"\"\"\n",
"    for i, token in enumerate(tokens):\n",
"        # Stem each token exactly once; assigning an unchanged stem is a no-op.\n",
"        tokens[i] = stemmer.stem(token)\n",
"    return tokens\n",
"\n",
"\n",
"# Stem every document; iterating tokens_doc itself avoids the N_DOC counter,\n",
"# which no earlier cell assigns.\n",
"tokens_doc = [stemming(doc_tokens) for doc_tokens in tokens_doc]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Flatten the per-document token lists into one list of terms.\n",
"# Every entry is already a (filtered, stemmed) tokenizer output, so joining the\n",
"# terms and tokenizing them a second time would only append duplicates; a single\n",
"# pass is enough. Iterating tokens_doc directly also avoids the N_DOC counter,\n",
"# which no earlier cell assigns.\n",
"all_tokens = [w for doc_tokens in tokens_doc for w in doc_tokens]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"all_tokens = set(all_tokens)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Positional (proximity) index:\n",
"#   term -> {DOCNO text -> [1-based positions of the term in that document]}\n",
"# Built in a single pass over the documents, so it needs neither the N_DOC\n",
"# counter (never assigned by an earlier cell) nor a rescan of every document\n",
"# for every vocabulary term.\n",
"proximity_index = {}\n",
"for doc_idx, doc_tokens in enumerate(tokens_doc):\n",
"    if not doc_tokens:\n",
"        # Nothing to index; also skips the DOCNO lookup for empty documents.\n",
"        continue\n",
"    doc_no = all_doc_no[doc_idx].firstChild.data\n",
"    for position, token in enumerate(doc_tokens, start=1):\n",
"        proximity_index.setdefault(token, {}).setdefault(doc_no, []).append(position)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import collections\n",
"\n",
"# Re-key the index in ascending term order, then print each term with its postings.\n",
"proximity_index = collections.OrderedDict(\n",
"    (term, proximity_index[term]) for term in sorted(proximity_index)\n",
")\n",
"for term, postings in proximity_index.items():\n",
"    print (term, postings)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
<SONG> i cant get no satisfaction </SONG> <SONG> i cant get no satisfaction </SONG>
<ARTIST> the rolling stones </ARTIST> <ARTIST> the rolling stones </ARTIST>
<YEAR> 1965 </YEAR> <YEAR> 1965 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -1598,7 +1598,7 @@ ...@@ -1598,7 +1598,7 @@
<SONG> love is a hurtin thing </SONG> <SONG> love is a hurtin thing </SONG>
<ARTIST> lou rawls </ARTIST> <ARTIST> lou rawls </ARTIST>
<YEAR> 1966 </YEAR> <YEAR> 1966 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -1616,7 +1616,7 @@ ...@@ -1616,7 +1616,7 @@
<SONG> gloria </SONG> <SONG> gloria </SONG>
<ARTIST> shadows of knight </ARTIST> <ARTIST> shadows of knight </ARTIST>
<YEAR> 1966 </YEAR> <YEAR> 1966 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -1778,7 +1778,7 @@ ...@@ -1778,7 +1778,7 @@
<SONG> zorba the greek </SONG> <SONG> zorba the greek </SONG>
<ARTIST> herb alpert and the tijuana brass </ARTIST> <ARTIST> herb alpert and the tijuana brass </ARTIST>
<YEAR> 1966 </YEAR> <YEAR> 1966 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -1940,7 +1940,7 @@ ...@@ -1940,7 +1940,7 @@
<SONG> kind of a drag </SONG> <SONG> kind of a drag </SONG>
<ARTIST> the buckinghams </ARTIST> <ARTIST> the buckinghams </ARTIST>
<YEAR> 1967 </YEAR> <YEAR> 1967 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -2399,7 +2399,7 @@ ...@@ -2399,7 +2399,7 @@
<SONG> soul finger </SONG> <SONG> soul finger </SONG>
<ARTIST> the barkays </ARTIST> <ARTIST> the barkays </ARTIST>
<YEAR> 1967 </YEAR> <YEAR> 1967 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -3272,7 +3272,7 @@ ...@@ -3272,7 +3272,7 @@
<SONG> stay in my corner </SONG> <SONG> stay in my corner </SONG>
<ARTIST> the dells </ARTIST> <ARTIST> the dells </ARTIST>
<YEAR> 1968 </YEAR> <YEAR> 1968 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -4217,7 +4217,7 @@ ...@@ -4217,7 +4217,7 @@
<SONG> twentyfive miles </SONG> <SONG> twentyfive miles </SONG>
<ARTIST> edwin starr </ARTIST> <ARTIST> edwin starr </ARTIST>
<YEAR> 1969 </YEAR> <YEAR> 1969 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -4325,7 +4325,7 @@ ...@@ -4325,7 +4325,7 @@
<SONG> tracy </SONG> <SONG> tracy </SONG>
<ARTIST> the cuff links </ARTIST> <ARTIST> the cuff links </ARTIST>
<YEAR> 1969 </YEAR> <YEAR> 1969 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -4415,7 +4415,7 @@ ...@@ -4415,7 +4415,7 @@
<SONG> your good thing is about to end </SONG> <SONG> your good thing is about to end </SONG>
<ARTIST> lou rawls </ARTIST> <ARTIST> lou rawls </ARTIST>
<YEAR> 1969 </YEAR> <YEAR> 1969 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -4559,7 +4559,7 @@ ...@@ -4559,7 +4559,7 @@
<SONG> ill be there </SONG> <SONG> ill be there </SONG>
<ARTIST> the jackson 5 </ARTIST> <ARTIST> the jackson 5 </ARTIST>
<YEAR> 1970 </YEAR> <YEAR> 1970 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -5234,7 +5234,7 @@ ...@@ -5234,7 +5234,7 @@
<SONG> the bells </SONG> <SONG> the bells </SONG>
<ARTIST> the originals </ARTIST> <ARTIST> the originals </ARTIST>
<YEAR> 1970 </YEAR> <YEAR> 1970 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -6233,7 +6233,7 @@ ...@@ -6233,7 +6233,7 @@
<SONG> dont knock my love </SONG> <SONG> dont knock my love </SONG>
<ARTIST> wilson pickett </ARTIST> <ARTIST> wilson pickett </ARTIST>
<YEAR> 1971 </YEAR> <YEAR> 1971 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -6368,7 +6368,7 @@ ...@@ -6368,7 +6368,7 @@
<SONG> baby dont get hooked on me </SONG> <SONG> baby dont get hooked on me </SONG>
<ARTIST> mac davis </ARTIST> <ARTIST> mac davis </ARTIST>
<YEAR> 1972 </YEAR> <YEAR> 1972 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -6755,7 +6755,7 @@ ...@@ -6755,7 +6755,7 @@
<SONG> jungle fever </SONG> <SONG> jungle fever </SONG>
<ARTIST> the chakachas </ARTIST> <ARTIST> the chakachas </ARTIST>
<YEAR> 1972 </YEAR> <YEAR> 1972 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -6935,7 +6935,7 @@ ...@@ -6935,7 +6935,7 @@
<SONG> joy </SONG> <SONG> joy </SONG>
<ARTIST> apollo 100 </ARTIST> <ARTIST> apollo 100 </ARTIST>
<YEAR> 1972 </YEAR> <YEAR> 1972 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -7511,7 +7511,7 @@ ...@@ -7511,7 +7511,7 @@
<SONG> keep on truckin </SONG> <SONG> keep on truckin </SONG>
<ARTIST> eddie kendricks </ARTIST> <ARTIST> eddie kendricks </ARTIST>
<YEAR> 1973 </YEAR> <YEAR> 1973 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -7574,7 +7574,7 @@ ...@@ -7574,7 +7574,7 @@
<SONG> give me love give me peace on earth </SONG> <SONG> give me love give me peace on earth </SONG>
<ARTIST> george harrison </ARTIST> <ARTIST> george harrison </ARTIST>
<YEAR> 1973 </YEAR> <YEAR> 1973 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -8645,7 +8645,7 @@ ...@@ -8645,7 +8645,7 @@
<SONG> hang on in there baby </SONG> <SONG> hang on in there baby </SONG>
<ARTIST> johnny bristol </ARTIST> <ARTIST> johnny bristol </ARTIST>
<YEAR> 1974 </YEAR> <YEAR> 1974 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -10157,7 +10157,7 @@ ...@@ -10157,7 +10157,7 @@
<SONG> theme from swat </SONG> <SONG> theme from swat </SONG>
<ARTIST> rhythm heritage </ARTIST> <ARTIST> rhythm heritage </ARTIST>
<YEAR> 1976 </YEAR> <YEAR> 1976 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -10283,7 +10283,7 @@ ...@@ -10283,7 +10283,7 @@
<SONG> theme from mahogany do you know where youre going to </SONG> <SONG> theme from mahogany do you know where youre going to </SONG>
<ARTIST> diana ross </ARTIST> <ARTIST> diana ross </ARTIST>
<YEAR> 1976 </YEAR> <YEAR> 1976 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -11822,7 +11822,7 @@ ...@@ -11822,7 +11822,7 @@
<SONG> emotion </SONG> <SONG> emotion </SONG>
<ARTIST> samantha sang </ARTIST> <ARTIST> samantha sang </ARTIST>
<YEAR> 1978 </YEAR> <YEAR> 1978 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -11885,7 +11885,7 @@ ...@@ -11885,7 +11885,7 @@
<SONG> feels so good </SONG> <SONG> feels so good </SONG>
<ARTIST> chuck mangione </ARTIST> <ARTIST> chuck mangione </ARTIST>
<YEAR> 1978 </YEAR> <YEAR> 1978 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -12344,7 +12344,7 @@ ...@@ -12344,7 +12344,7 @@
<SONG> because the night </SONG> <SONG> because the night </SONG>
<ARTIST> patti smith group </ARTIST> <ARTIST> patti smith group </ARTIST>
<YEAR> 1978 </YEAR> <YEAR> 1978 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -12452,7 +12452,7 @@ ...@@ -12452,7 +12452,7 @@
<SONG> fool if you think its over </SONG> <SONG> fool if you think its over </SONG>
<ARTIST> chris rea </ARTIST> <ARTIST> chris rea </ARTIST>
<YEAR> 1978 </YEAR> <YEAR> 1978 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -12965,7 +12965,7 @@ ...@@ -12965,7 +12965,7 @@
<SONG> lotta love </SONG> <SONG> lotta love </SONG>
<ARTIST> nicolette larson </ARTIST> <ARTIST> nicolette larson </ARTIST>
<YEAR> 1979 </YEAR> <YEAR> 1979 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -13316,7 +13316,7 @@ ...@@ -13316,7 +13316,7 @@
<SONG> rise </SONG> <SONG> rise </SONG>
<ARTIST> herb alpert </ARTIST> <ARTIST> herb alpert </ARTIST>
<YEAR> 1979 </YEAR> <YEAR> 1979 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -13424,7 +13424,7 @@ ...@@ -13424,7 +13424,7 @@
<SONG> bad case of loving you doctor doctor </SONG> <SONG> bad case of loving you doctor doctor </SONG>
<ARTIST> robert palmer </ARTIST> <ARTIST> robert palmer </ARTIST>
<YEAR> 1979 </YEAR> <YEAR> 1979 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -13982,7 +13982,7 @@ ...@@ -13982,7 +13982,7 @@
<SONG> rise </SONG> <SONG> rise </SONG>
<ARTIST> herb alpert </ARTIST> <ARTIST> herb alpert </ARTIST>
<YEAR> 1980 </YEAR> <YEAR> 1980 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -14504,7 +14504,7 @@ ...@@ -14504,7 +14504,7 @@
<SONG> morning train nine to five </SONG> <SONG> morning train nine to five </SONG>
<ARTIST> sheena easton </ARTIST> <ARTIST> sheena easton </ARTIST>
<YEAR> 1981 </YEAR> <YEAR> 1981 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -14666,7 +14666,7 @@ ...@@ -14666,7 +14666,7 @@
<SONG> the best of times </SONG> <SONG> the best of times </SONG>
<ARTIST> styx </ARTIST> <ARTIST> styx </ARTIST>
<YEAR> 1981 </YEAR> <YEAR> 1981 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -14801,7 +14801,7 @@ ...@@ -14801,7 +14801,7 @@
<SONG> how bout us </SONG> <SONG> how bout us </SONG>
<ARTIST> champaign </ARTIST> <ARTIST> champaign </ARTIST>
<YEAR> 1981 </YEAR> <YEAR> 1981 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -15872,7 +15872,7 @@ ...@@ -15872,7 +15872,7 @@
<SONG> you could have been with me </SONG> <SONG> you could have been with me </SONG>
<ARTIST> sheena easton </ARTIST> <ARTIST> sheena easton </ARTIST>
<YEAR> 1982 </YEAR> <YEAR> 1982 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -16025,7 +16025,7 @@ ...@@ -16025,7 +16025,7 @@
<SONG> i keep forgettin every time youre near </SONG> <SONG> i keep forgettin every time youre near </SONG>
<ARTIST> michael mcdonald </ARTIST> <ARTIST> michael mcdonald </ARTIST>
<YEAR> 1982 </YEAR> <YEAR> 1982 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -17510,7 +17510,7 @@ ...@@ -17510,7 +17510,7 @@
<SONG> the warrior </SONG> <SONG> the warrior </SONG>
<ARTIST> scandal </ARTIST> <ARTIST> scandal </ARTIST>
<YEAR> 1984 </YEAR> <YEAR> 1984 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -19976,7 +19976,7 @@ ...@@ -19976,7 +19976,7 @@
<SONG> at this moment </SONG> <SONG> at this moment </SONG>
<ARTIST> billy vera and the beaters </ARTIST> <ARTIST> billy vera and the beaters </ARTIST>
<YEAR> 1987 </YEAR> <YEAR> 1987 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -20291,7 +20291,7 @@ ...@@ -20291,7 +20291,7 @@
<SONG> songbird </SONG> <SONG> songbird </SONG>
<ARTIST> kenny g </ARTIST> <ARTIST> kenny g </ARTIST>
<YEAR> 1987 </YEAR> <YEAR> 1987 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -21164,7 +21164,7 @@ ...@@ -21164,7 +21164,7 @@
<SONG> whats on your mind pure energy </SONG> <SONG> whats on your mind pure energy </SONG>
<ARTIST> information society </ARTIST> <ARTIST> information society </ARTIST>
<YEAR> 1988 </YEAR> <YEAR> 1988 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -21308,7 +21308,7 @@ ...@@ -21308,7 +21308,7 @@
<SONG> rocket 2 u </SONG> <SONG> rocket 2 u </SONG>
<ARTIST> the jets </ARTIST> <ARTIST> the jets </ARTIST>
<YEAR> 1988 </YEAR> <YEAR> 1988 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -22820,7 +22820,7 @@ ...@@ -22820,7 +22820,7 @@
<SONG> i dont have the heart </SONG> <SONG> i dont have the heart </SONG>
<ARTIST> james ingram </ARTIST> <ARTIST> james ingram </ARTIST>
<YEAR> 1990 </YEAR> <YEAR> 1990 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -23252,7 +23252,7 @@ ...@@ -23252,7 +23252,7 @@
<SONG> oh girl </SONG> <SONG> oh girl </SONG>
<ARTIST> paul young </ARTIST> <ARTIST> paul young </ARTIST>
<YEAR> 1990 </YEAR> <YEAR> 1990 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -25853,7 +25853,7 @@ ...@@ -25853,7 +25853,7 @@
<SONG> forever in love </SONG> <SONG> forever in love </SONG>
<ARTIST> kenny g </ARTIST> <ARTIST> kenny g </ARTIST>
<YEAR> 1993 </YEAR> <YEAR> 1993 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -27374,7 +27374,7 @@ ...@@ -27374,7 +27374,7 @@
<SONG> ill be there for youyoure all i need to get by </SONG> <SONG> ill be there for youyoure all i need to get by </SONG>
<ARTIST> method man featuring mary j blige </ARTIST> <ARTIST> method man featuring mary j blige </ARTIST>
<YEAR> 1995 </YEAR> <YEAR> 1995 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -27851,7 +27851,7 @@ ...@@ -27851,7 +27851,7 @@
<SONG> ill stand by you </SONG> <SONG> ill stand by you </SONG>
<ARTIST> the pretenders </ARTIST> <ARTIST> the pretenders </ARTIST>
<YEAR> 1995 </YEAR> <YEAR> 1995 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -31685,7 +31685,7 @@ ...@@ -31685,7 +31685,7 @@
<SONG> bye bye bye </SONG> <SONG> bye bye bye </SONG>
<ARTIST> n sync </ARTIST> <ARTIST> n sync </ARTIST>
<YEAR> 2000 </YEAR> <YEAR> 2000 </YEAR>
<LYRICS> </LYRICS> <LYRICS> - </LYRICS>
<SOURCE> 1.0 </SOURCE> <SOURCE> 1.0 </SOURCE>
</DOC> </DOC>
<DOC> <DOC>
...@@ -3,16 +3,14 @@ resource_package = __name__ ...@@ -3,16 +3,14 @@ resource_package = __name__
import string import string
import re import re
from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer
import string
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize from nltk.tokenize import sent_tokenize, word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from itertools import count from itertools import count
import collections import collections
import math import math
import xml.etree.ElementTree as et import xml.etree.ElementTree as et
from xml.etree.ElementTree import ElementTree
##############Remove Punctuation, URL and Tokenize################### ##############Remove Punctuation, URL and Tokenize###################
...@@ -50,20 +48,37 @@ def generate_ngrams(data, n): ...@@ -50,20 +48,37 @@ def generate_ngrams(data, n):
return ngram, result return ngram, result
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
def stop_word_token(tokens):
tokens = [w for w in tokens if not w in stop_words]
return tokens
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
def stemming(tokens):
for i in range(0, len(tokens)):
if (tokens[i] != stemmer.stem(tokens[i])):
tokens[i] = stemmer.stem(tokens[i])
return tokens
def main(query): def main(query):
tree = ElementTree() tree = ElementTree()
tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml") tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")
all_doc_no = [] all_doc_no = []
all_headline = [] all_song = []
all_text = [] all_text = []
for node in tree.iter("DOCNO"): for node in tree.iter("DOCNO"):
all_doc_no.append(node.text) all_doc_no.append(node.text)
for node in tree.iter("SONG"): for node in tree.iter("SONG"):
all_headline.append(node.text) all_song.append(node.text)
for node in tree.iter("LYRICS"): for node in tree.iter("LYRICS"):
all_text.append(node.text) all_text.append(node.text)
...@@ -72,7 +87,7 @@ def main(query): ...@@ -72,7 +87,7 @@ def main(query):
all_sentence_doc = [] all_sentence_doc = []
for i in range(N_DOC): for i in range(N_DOC):
all_sentence_doc.append(all_headline[i] + all_text[i]) all_sentence_doc.append(all_song[i] + all_text[i])
tokens_doc = [] tokens_doc = []
for i in range(N_DOC): for i in range(N_DOC):
tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i])) tokens_doc.append(remove_punc_tokenize(all_sentence_doc[i]))
...@@ -189,7 +204,7 @@ def main(query): ...@@ -189,7 +204,7 @@ def main(query):
score*=idf[i] #tf * idf score*=idf[i] #tf * idf
idx = all_doc_no[i] idx = all_doc_no[i]
judul = all_headline[i] judul = all_song[i]
dic['docno'] = idx dic['docno'] = idx
dic['judul'] = judul dic['judul'] = judul
...@@ -209,16 +224,16 @@ def detail(nomor): ...@@ -209,16 +224,16 @@ def detail(nomor):
tree.parse("apps/data/dataset_STBI.xml") tree.parse("apps/data/dataset_STBI.xml")
all_doc_no = [] all_doc_no = []
all_headline = [] all_song = []
all_text = [] all_text = []
for node in tree.iter("DOCNO"): for node in tree.iter("DOCNO"):
all_doc_no.append(node.text) all_doc_no.append(node.text)
for node in tree.iter("SONG"): for node in tree.iter("SONG"):
# all_headline.append(node.text.replace("\n"," ")) # all_song.append(node.text.replace("\n"," "))
all_headline.append(node.text) all_song.append(node.text)
head = all_headline head = all_song
for node in tree.iter("LYRICS"): for node in tree.iter("LYRICS"):
# all_text.append(node.text.replace("\n"," ")) # all_text.append(node.text.replace("\n"," "))
...@@ -233,5 +248,5 @@ def detail(nomor): ...@@ -233,5 +248,5 @@ def detail(nomor):
check = all_doc_no[i] check = all_doc_no[i]
if check == id: if check == id:
text = all_text[i] text = all_text[i]
judul = all_headline[i] judul = all_song[i]
return text,judul return text,judul
\ No newline at end of file
...@@ -55,6 +55,15 @@ footer { ...@@ -55,6 +55,15 @@ footer {
border-radius: 15px; border-radius: 15px;
padding: 20px; padding: 20px;
margin-top: 10px; margin-top: 10px;
width: auto;
}
.carda {
box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2);
border-radius: 15px;
padding: 20px;
margin-top: 10px;
width: max-content;
} }
.jumbotron { .jumbotron {
......
...@@ -5,35 +5,103 @@ ...@@ -5,35 +5,103 @@
<meta name="viewport" content="width=device-width, initial-scale=1"> <meta name="viewport" content="width=device-width, initial-scale=1">
<title>Song Lyric Search Engine</title> <title>Song Lyric Search Engine</title>
<link href="../../static/assets/css/dataframe.min.css" rel="stylesheet"> <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
<style>
#leftbox {
float:left;
white-space: nowrap;
}
#middlebox{
float:left;
white-space: nowrap;
}
#middleboxa{
float:left;
white-space: nowrap;
}
#rightbox{
float:right;
white-space: nowrap;
}
</style>
</head> </head>
<body> <body>
<main> <main>
<div id="content"> <div id="content">
<article class="card"> <article class="card">
<div align="right"> <div>
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button> <div>
</div> <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
<center><h1>Dataset</h1><br> </div>
<table style="width:100%"> <div align="right">
<tr> <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
<th>DOCNO</th> </div>
<th>SONG</th> </div>
<th>ARTIST</th>
<th>LYRICS</th> <center><h1>Dataset</h1><br></center>
</tr> <article class="carda" style="overflow-x:scroll; overflow-y:scroll;">
{% for l in LYRICS %} <div id = "leftbox">
<tr> <table>
<td>{{ i }}</td> <tr>
<td>{{ j }}</td> <th>DOCNO</th>
<td>{{ k }}</td> </tr>
<td>{{ l }}</td>
</tr> {% for i in DOCNO %}
{% endfor %} <tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table align="left">
<tr>
<th>SONG</th>
</tr>
{% for i in SONG %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>ARTIST</th>
</tr>
{% for i in ARTIST %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
<div id = "middlebox">
<table>
<tr>
<th>LYRICS</th>
</tr>
{% for i in LYRICS %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</div>
</table> </article>
</center>
</article> </article>
</div> </div>
...@@ -46,9 +114,13 @@ ...@@ -46,9 +114,13 @@
</body> </body>
<script> <script>
function pageRedirect() { function pageRedirect_prev() {
window.location.href = "/preprocessing"; window.location.href = "/home";
} }
function pageRedirect_next() {
window.location.href = "/preprocessing";
}
</script> </script>
</html> </html>
...@@ -9,7 +9,12 @@ ...@@ -9,7 +9,12 @@
<body> <body>
<main> <main>
<div id="content"> <div id="content">
<article class="card"> <article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
</div>
<div class="row"> <div class="row">
<center><h1 style="font-size:45px">Searching!<br></h1> <center><h1 style="font-size:45px">Searching!<br></h1>
<p style="font-size:20px"><strong>Silahkan masukkan lirik dari lagu yang ingin Anda temukan</strong></p> <p style="font-size:20px"><strong>Silahkan masukkan lirik dari lagu yang ingin Anda temukan</strong></p>
...@@ -27,10 +32,14 @@ ...@@ -27,10 +32,14 @@
</div> </div>
</main> </main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body> </body>
<script>
function pageRedirect_prev() {
window.location.href = "/indexing";
}
</script>
</html> </html>
...@@ -11,25 +11,44 @@ ...@@ -11,25 +11,44 @@
<main> <main>
<div id="content"> <div id="content">
<article class="card"> <article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right"> <div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button> <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div> </div>
</div>
<center><h1>Indexing</h1><br></center> <center><h1>Indexing</h1><br></center>
<p><strong>Dengan Proximity Index</strong></p><br></center>
<table style="width:100%">
<tr>
<th>Apa judulnya ya?</th>
</tr>
{% for i in indexnya %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article> </article>
</div> </div>
</main> </main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body> </body>
<script> <script>
function pageRedirect() { function pageRedirect_prev() {
window.location.href = "/index"; window.location.href = "/preprocessing4";
} }
function pageRedirect_next() {
window.location.href = "/index";
}
</script> </script>
</html> </html>
...@@ -11,25 +11,45 @@ ...@@ -11,25 +11,45 @@
<main> <main>
<div id="content"> <div id="content">
<article class="card"> <article class="card">
<div>
<div>
<button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
</div>
<div align="right"> <div align="right">
<button onclick="pageRedirect()" class="button" style="vertical-align:middle"><span>Next</span></button> <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
</div> </div>
<center><h1>Text Preprocessing</h1><br></center> </div>
<center><p style="font-size:40px;"><strong>Text Preprocessing - 1</strong></p>
<p><strong>After Punctuation Removal and Tokenization</strong></p><br></center>
<table style="width:100%">
<tr>
<th>All tokens for each document</th>
</tr>
{% for i in tokens_doc %}
<tr>
<td>{{ i }}</td>
</tr>
{% endfor %}
</table>
</article> </article>
</div> </div>
</main> </main>
<footer>
<p>&copy; STBI-2020-03</p>
</footer>
</body> </body>
<script> <script>
function pageRedirect() { function pageRedirect_prev() {
window.location.href = "/indexing"; window.location.href = "/dataframe";
} }
function pageRedirect_next() {
window.location.href = "/preprocessing2";
}
</script> </script>
</html> </html>
<!DOCTYPE html>
<html lang="en">
<head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Song Lyric Search Engine</title>
    <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
    {# Preprocessing step 2: shows the per-document token lists passed in as `tokens_doc`. #}
    <main>
        <div id="content">
            <article class="card">
                {# Previous / Next navigation between the pipeline pages. #}
                <div>
                    <div>
                        <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
                    </div>
                    <div align="right">
                        <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
                    </div>
                </div>
                <center><p style="font-size:40px;"><strong>Text Preprocessing - 2</strong></p>
                <p><strong>After Case Folding</strong></p><br></center>
                {# One table row per entry of `tokens_doc`. #}
                <table style="width:100%">
                    <tr>
                        <th>All tokens for each document</th>
                    </tr>
                    {% for i in tokens_doc %}
                    <tr>
                        <td>{{ i }}</td>
                    </tr>
                    {% endfor %}
                </table>
            </article>
        </div>
    </main>
    {# The script lives inside <body>: a <script> element after </body> is not valid HTML. #}
    <script>
        // Targets carry the trailing slash used by the URL patterns
        // ('preprocessing/', 'preprocessing3/') so no redirect round-trip is needed.
        function pageRedirect_prev() {
            window.location.href = "/preprocessing/";
        }
        function pageRedirect_next() {
            window.location.href = "/preprocessing3/";
        }
    </script>
</body>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Song Lyric Search Engine</title>
    <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
    {# Preprocessing step 3: shows the per-document token lists passed in as `tokens_doc`. #}
    <main>
        <div id="content">
            <article class="card">
                {# Previous / Next navigation between the pipeline pages. #}
                <div>
                    <div>
                        <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
                    </div>
                    <div align="right">
                        <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
                    </div>
                </div>
                <center><p style="font-size:40px;"><strong>Text Preprocessing - 3</strong></p>
                <p><strong>After Stopwords Removal</strong></p><br></center>
                {# One table row per entry of `tokens_doc`. #}
                <table style="width:100%">
                    <tr>
                        <th>All tokens for each document</th>
                    </tr>
                    {% for i in tokens_doc %}
                    <tr>
                        <td>{{ i }}</td>
                    </tr>
                    {% endfor %}
                </table>
            </article>
        </div>
    </main>
    {# The script lives inside <body>: a <script> element after </body> is not valid HTML. #}
    <script>
        // Targets carry the trailing slash used by the URL patterns
        // ('preprocessing2/', 'preprocessing4/') so no redirect round-trip is needed.
        function pageRedirect_prev() {
            window.location.href = "/preprocessing2/";
        }
        function pageRedirect_next() {
            window.location.href = "/preprocessing4/";
        }
    </script>
</body>
</html>
<!DOCTYPE html>
<html lang="en">
<head>
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <title>Song Lyric Search Engine</title>
    <link href="../../static/assets/css/dataframe.min.css" rel="stylesheet">
</head>
<body>
    {# Preprocessing step 4: shows the per-document token lists passed in as `tokens_doc`. #}
    <main>
        <div id="content">
            <article class="card">
                {# Previous / Next navigation between the pipeline pages. #}
                <div>
                    <div>
                        <button onclick="pageRedirect_prev()" class="button" style="vertical-align:middle"><span>Previous</span></button>
                    </div>
                    <div align="right">
                        <button onclick="pageRedirect_next()" class="button" style="vertical-align:middle"><span>Next</span></button>
                    </div>
                </div>
                <center><p style="font-size:40px;"><strong>Text Preprocessing - 4</strong></p>
                <p><strong>After Normalization</strong></p><br></center>
                {# One table row per entry of `tokens_doc`. #}
                <table style="width:100%">
                    <tr>
                        <th>All tokens for each document</th>
                    </tr>
                    {% for i in tokens_doc %}
                    <tr>
                        <td>{{ i }}</td>
                    </tr>
                    {% endfor %}
                </table>
            </article>
        </div>
    </main>
    {# The script lives inside <body>: a <script> element after </body> is not valid HTML. #}
    <script>
        // Targets carry the trailing slash used by the URL patterns
        // ('preprocessing3/', 'indexing/') so no redirect round-trip is needed.
        function pageRedirect_prev() {
            window.location.href = "/preprocessing3/";
        }
        function pageRedirect_next() {
            window.location.href = "/indexing/";
        }
    </script>
</body>
</html>
...@@ -10,6 +10,9 @@ urlpatterns = [ ...@@ -10,6 +10,9 @@ urlpatterns = [
path('', views.home), path('', views.home),
path('dataframe/', views.dataframe), path('dataframe/', views.dataframe),
path('preprocessing/', views.preprocessing), path('preprocessing/', views.preprocessing),
path('preprocessing2/', views.preprocessing2),
path('preprocessing3/', views.preprocessing3),
path('preprocessing4/', views.preprocessing4),
path('indexing/', views.indexing), path('indexing/', views.indexing),
path('index/', views.index), path('index/', views.index),
path('result/', views.result), path('result/', views.result),
......
...@@ -39,10 +39,246 @@ def dataframe(request): ...@@ -39,10 +39,246 @@ def dataframe(request):
return render(request, 'apps/dataframe.html', context) return render(request, 'apps/dataframe.html', context)
def preprocessing(request):
    """Render preprocessing step 1: punctuation removal and tokenization.

    Parses the song dataset XML, concatenates each SONG title with the LYRICS
    text found at the same position, and tokenizes every resulting document
    with ``main.remove_punc_tokenize``.

    Args:
        request: The request object, forwarded unchanged to ``render``.

    Returns:
        The result of rendering ``apps/preprocessing.html`` with a context
        holding ``tokens_doc``: one tokenizer result per document.
    """
    from xml.etree.ElementTree import ElementTree

    tree = ElementTree()
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    all_song = [node.text for node in tree.iter("SONG")]
    all_text = [node.text for node in tree.iter("LYRICS")]

    # The number of LYRICS elements defines the number of documents; the title
    # is prepended to the lyrics as-is (no separator is inserted).
    tokens_doc = [
        main.remove_punc_tokenize(all_song[i] + all_text[i])
        for i in range(len(all_text))
    ]

    context = {"tokens_doc": tokens_doc}
    return render(request, 'apps/preprocessing.html', context)
def preprocessing2(request):
    """Render preprocessing step 2: tokenization followed by case folding.

    Each document is the SONG title concatenated with the LYRICS text at the
    same position in the dataset XML. Documents are tokenized with
    ``main.remove_punc_tokenize`` and then passed through ``main.to_lower``.

    Args:
        request: The request object, forwarded unchanged to ``render``.

    Returns:
        The result of rendering ``apps/preprocessing2.html`` with a context
        holding ``tokens_doc``: one processed token collection per document.
    """
    from xml.etree.ElementTree import ElementTree

    dataset = ElementTree()
    dataset.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    titles = [element.text for element in dataset.iter("SONG")]
    lyrics = [element.text for element in dataset.iter("LYRICS")]

    # The LYRICS count drives the document count; titles are looked up by index.
    documents = [titles[pos] + lyrics[pos] for pos in range(len(lyrics))]

    # Stage 1: punctuation removal + tokenization for every document.
    tokenized = [main.remove_punc_tokenize(document) for document in documents]
    # Stage 2: case folding.
    tokens_doc = [main.to_lower(tokens) for tokens in tokenized]

    return render(request, 'apps/preprocessing2.html', {"tokens_doc": tokens_doc})
def preprocessing3(request):
    """Render preprocessing step 3: stop-word removal.

    Each document is the SONG title concatenated with the LYRICS text at the
    same position in the dataset XML. Documents go through
    ``main.remove_punc_tokenize``, ``main.to_lower`` and
    ``main.stop_word_token``; afterwards every token that contains at least
    one digit character is dropped.

    Args:
        request: The request object, forwarded unchanged to ``render``.

    Returns:
        The result of rendering ``apps/preprocessing3.html`` with a context
        holding ``tokens_doc``: one list of surviving tokens per document.
    """
    from xml.etree.ElementTree import ElementTree

    dataset = ElementTree()
    dataset.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    titles = [element.text for element in dataset.iter("SONG")]
    lyrics = [element.text for element in dataset.iter("LYRICS")]

    # The LYRICS count drives the document count; titles are looked up by index.
    documents = [titles[pos] + lyrics[pos] for pos in range(len(lyrics))]

    # Each stage is applied to the whole collection before the next one starts.
    tokenized = [main.remove_punc_tokenize(document) for document in documents]
    lowered = [main.to_lower(tokens) for tokens in tokenized]
    without_stopwords = [main.stop_word_token(tokens) for tokens in lowered]

    # Discard any token that has a digit anywhere in it.
    tokens_doc = [
        [word for word in tokens if not any(char.isdigit() for char in word)]
        for tokens in without_stopwords
    ]

    return render(request, 'apps/preprocessing3.html', {"tokens_doc": tokens_doc})
def preprocessing4(request):
    """Render preprocessing step 4: the fully preprocessed token lists.

    Each document is the SONG title concatenated with the LYRICS text at the
    same position in the dataset XML. Documents go through
    ``main.remove_punc_tokenize``, ``main.to_lower`` and
    ``main.stop_word_token``, tokens containing a digit are dropped, and the
    remainder is passed to ``main.stemming``.

    Args:
        request: The request object, forwarded unchanged to ``render``.

    Returns:
        The result of rendering ``apps/preprocessing4.html`` with a context
        holding ``tokens_doc``: one ``main.stemming`` result per document.
    """
    from xml.etree.ElementTree import ElementTree

    dataset = ElementTree()
    dataset.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    titles = [element.text for element in dataset.iter("SONG")]
    lyrics = [element.text for element in dataset.iter("LYRICS")]

    # The LYRICS count drives the document count; titles are looked up by index.
    documents = [titles[pos] + lyrics[pos] for pos in range(len(lyrics))]

    # Each stage is applied to the whole collection before the next one starts.
    tokenized = [main.remove_punc_tokenize(document) for document in documents]
    lowered = [main.to_lower(tokens) for tokens in tokenized]
    without_stopwords = [main.stop_word_token(tokens) for tokens in lowered]

    # Discard any token that has a digit anywhere in it.
    digit_free = [
        [word for word in tokens if not any(char.isdigit() for char in word)]
        for tokens in without_stopwords
    ]

    tokens_doc = [main.stemming(tokens) for tokens in digit_free]

    return render(request, 'apps/preprocessing4.html', {"tokens_doc": tokens_doc})
def indexing(request):
    """Build the proximity (positional) index of the dataset and render it.

    Each document is the SONG title concatenated with the LYRICS text at the
    same position in the dataset XML. Documents are run through the
    preprocessing helpers of ``main`` (tokenize, lower-case, stop-word
    removal, digit-token removal, stemming). The index maps every token to a
    dict ``{DOCNO text: [1-based positions of the token in that document]}``.

    The vocabulary additionally contains whatever the default
    ``CountVectorizer`` tokenizer extracts from the space-joined tokens; such
    a token gets an empty postings dict when it occurs in no document as-is.

    Args:
        request: The request object, forwarded unchanged to ``render``.

    Returns:
        The result of rendering ``apps/indexing.html`` with a context holding
        ``indexnya``: a list of ``(token, postings)`` pairs sorted by token.
    """
    from collections import defaultdict
    from sklearn.feature_extraction.text import CountVectorizer
    from xml.etree.ElementTree import ElementTree

    tree = ElementTree()
    tree.parse("InvertedIndexSimulator/data/dataset_STBI.xml")

    all_doc_no = [node.text for node in tree.iter("DOCNO")]
    all_song = [node.text for node in tree.iter("SONG")]
    all_text = [node.text for node in tree.iter("LYRICS")]

    # The number of LYRICS elements defines the number of documents.
    tokens_doc = []
    for i in range(len(all_text)):
        tokens = main.remove_punc_tokenize(all_song[i] + all_text[i])
        tokens = main.to_lower(tokens)
        tokens = main.stop_word_token(tokens)
        # Drop every token that has a digit anywhere in it.
        tokens = [w for w in tokens if not any(ch.isdigit() for ch in w)]
        tokens_doc.append(main.stemming(tokens))

    # One pass per document instead of scanning every document per token.
    proximity_index = defaultdict(dict)
    for n, tokens in enumerate(tokens_doc):
        positions = defaultdict(list)
        for position, token in enumerate(tokens, start=1):
            positions[token].append(position)
        for token, where in positions.items():
            # DOCNO values come from ``node.text`` and are plain strings, so
            # they are used directly as keys (no minidom ``firstChild.data``).
            proximity_index[token][all_doc_no[n]] = where

    # Keep the vocabulary identical to the previous behaviour: tokens produced
    # by re-tokenizing the joined text are index entries too, possibly empty.
    joined_tokens = ' '.join(token for tokens in tokens_doc for token in tokens)
    for token in CountVectorizer().build_tokenizer()(joined_tokens):
        proximity_index.setdefault(token, {})

    # Hand the whole sorted index to the template, not just its last entry;
    # an empty dataset yields an empty list instead of an undefined name.
    indexnya = sorted(proximity_index.items())

    context = {"indexnya": indexnya}
    return render(request, 'apps/indexing.html', context)
def index(request): def index(request):
return render(request, 'apps/index.html') return render(request, 'apps/index.html')
......
...@@ -22,6 +22,9 @@ urlpatterns = [ ...@@ -22,6 +22,9 @@ urlpatterns = [
path('', views.home), path('', views.home),
path('dataframe/', views.dataframe), path('dataframe/', views.dataframe),
path('preprocessing/', views.preprocessing), path('preprocessing/', views.preprocessing),
path('preprocessing2/', views.preprocessing2),
path('preprocessing3/', views.preprocessing3),
path('preprocessing4/', views.preprocessing4),
path('indexing/', views.indexing), path('indexing/', views.indexing),
path('index/', views.index), path('index/', views.index),
path('result/', views.result), path('result/', views.result),
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment