Documente Academic
Documente Profesional
Documente Cultură
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
In [60]:
df=pd.read_csv(r"amazon_co-ecommerce_sample.csv")
df
Out[60]:
Hornby 2014
0 eac7efa5dbd3d667f26eb3d3ab504464 Hornby £3.42
Catalogue
FunkyBuys®
Large
1 b17540ef7e86e461d37f3ae58b7b72ac Christmas FunkyBuys £16.99
Holiday
Express Fes...
CLASSIC TOY
TRAIN SET
2 348f344247b0c1a935b1223072ef9d8a TRACK ccf £9.99
CARRIAGES
LIGHT EN...
HORNBY
Coach R4410A
3 e12b92dbb8eaee78b22965d2a9bbbd9f BR Hornby £39.99
Hawksworth
Corridor 3rd
Hornby 00
Gauge 0-4-0
4 e33a9adeed5f36840ccc227db4682a36 Hornby £32.19
Gildenlow Salt
Co. Steam...
20pcs Model
Garden Light
5 cb34f0a84102c1ebc3ef6892d7444d36 Generic £6.99
Double Heads
Lamppost...
Hornby 00
Gauge 230mm
6 f74b562470571dfb689324adf236f82c BR Bogie Hornby £24.99
Passenger
Brake...
Hornby Santa's
7 87bbb472ef9d90dcef140a551665c929 Express Train Hornby £69.93
Set
Hornby Gauge
Western
8 7e2aa2b4596a39ba852449718413d7cc Hornby £235.58
Express Digital
Train Set...
Learning Curve
Chuggington
9 5afbaf65680c9f378af5b3a3ae22427e Chuggington NaN
Interactive
Chatsworth
Hornby Gauge
Railroad
10 5c76389a8c302c6d7d6e179393031b97 Mosley Hornby £27.49
Tarmacadam
Locomo...
Kato (USA)
176-1308 F3B
11 878048c41f3c249badb3704e160b4c6e Kato (USA) £273.60
Denver & Rio
Grande We...
uniq_id product_name manufacturer price number_ava
Bachmann 37-
662 14 Ton
12 f910c6542ededa5abf81787c0fd87c99 Bachmann £9.60
Tank Wagon
Pease & Part...
Hornby 00
Gauge 253mm
13 03a35de3f7af9814978e6de645cb8ffa Weathered Hornby £119.50
Paviland
Grang...
Kato 3060-2
EF65 500 (F
14 c68c3ae3b0ea3146beae99f3d4a6997c Kato NaN
Model) Electric
Locomo...
Glacier
Express of N
15 d27964f50577db8e46f2069b050c62c6 Kato NaN
gauge 10-1219
Alps [UNESC...
Power Trains
Freight
16 c3b2f6ec9cf6250c960c26ee8ad33509 Power Trains NaN
Industrial (Pack
of 4)
Chuggington
Interactive
17 e9ef14a0253f074343b5441540d8471f Chuggington NaN
Wash and Fuel
Set with...
Kumoyuni 74-0
18 0ca63377ca8015e585efa1d418f04756 Shonan Color Kato £17.08
(Model Train)
Bachmann 31-
588
19 42d20aa25e5902eff9f5d307bb38230e Freightliner Bachmann £96.05
Class 70 005
Powe...
Preiser 30495
Horse Drawn
20 4aaa27925929708a6b56fd7e46b35208 Wedding Preiser £27.55
Coach
(Closed)
Preiser 30414
Horse Drawn
21 81b3ad5c07a31fa67e4dc8643afa3275 Preiser £24.50
Liquid Manure
Wagon
Bachmann
Class A2
22 26de90be8191b92ea208c2111c7090cd 60534 'Irish Bachmann £149.92
Elegance' BR
Li...
Plarail - S-29
Steam
23 4eb5ea748ea030b14c3d1789f9578133 Locomotive Takara Tomy £12.87
Type C61-20
w/...
Roco 64723
OBB Railjet
24 1b3d6d9f9c79db8c54c0c2cbb50de1cf Roco £49.95
Economy
Coach V
uniq_id product_name manufacturer price number_ava
Learning Curve
Chuggington
25 79fd94b655dbcf797005e39e700eb3f1 Chuggington NaN
Interactive
Brewster
Plarail - AS-07
Shinkansen
26 6c4062e5b0da136365915b470ec9d4a4 Takara Tomy £14.44
Series 700
(Model T...
Thomas and
27 3b4be663a9878555100854f626202b17 Friends Take- Fisher-Price NaN
n-Play Molly
Hornby R2981
London 2012
28 8e8be9b29d3b1a794a906510fad8a75d Hornby NaN
1948 Games
00 Gauge L...
Gaugemaster
HBYS Hornby
29 8a9af1fc5f227191c00447b0c44f7700 Gaugemaster £5.95
Type Colour
Light Sign...
Funko POP!
9972 c5dc2700783716cce0b0b9118bdd17c0 Harry Potter FunKo £13.90
Sword
Hot Toys -
Batman The
9973 9736294cd697539a88eeb851dfdd2771 Hot Toys £399.99
Dark Knight
Rises Movie ...
Lavender
Brown
Noble
9974 debdd5dc7adae2e92ef590ce4612dd5c Character £26.99
Collection
Wand. Harry
Potter No...
GAME OF
THRONES
9975 d8972b4f08f551d6b7d7ce23c5bf95ed Abystyle £15.63
Flag Stark
(70x120)
Batman The
Dark Knight
9976 569b2e8a3aef1831cd68fe1de287c809 Batman NaN
Batarang Prop
Replica W...
Star Wars
Costume, Kids
9977 c38c62b7eaf8de33f0addff73b17f807 Star Wars £26.99
Han Solo Outfit
Style ...
uniq_id product_name manufacturer price number_ava
Star Wars
9978 fca7110160af8a481fc94e9a9c8a07e3 Filled Desk Star Wars £2.89
Tidy
Albus
Dumbledore
9979 ccfd41e97f866080af3f17bc64120e3d (Harry Potter Mattel £63.73
Magical Minis
C...
Square Enix
Halo Reach
9980 1c0b975783a1c9f25ba226d4394bf46d Square-Enix £39.48
Play Arts Kai
Series 1 ...
HSDS030BO
foam tray for
9981 ab9fe33566df86b90c82ed78e38e08c7 Star Wars Feldherr £3.70
Armada Wave
...
Star Wars
Clone Wars -
9982 7fa63c36b92bc52acd490c5774ea4101 Obi-Wan Hasbro £58.18
Kenobi's
Starfi...
Dc Comics
Infinite Crisis
9983 32165eaeb83b3e5e772f7bd5b31ac13b DC Comics NaN
Pajama Party
Harley ...
Master
Replicas -
Master
9984 df6b6fa9e9d4d0994ac3fa67cd3ded71 Clone Trooper NaN
Replicas
Helmet Scaled
...
Playskool
Heroes Super
9985 5962c0a2623129d656aac1ee19239dcb Hero Marvel £11.95
Adventures
Action ...
Marauder's
9986 6b9c92678116a53b8d5a656b64cbcabb map wallscroll - GGS £31.14
Harry Potter
Thundercats
9987 0c99ecefabf3ec25d4fae53d0646fb3b 10cm Action Thundercats £11.44
Figure: Wilykat
Captain
America - The Captain
9988 8cc983a24c305a6dc91e3d8e6e421a72 £25.33
First Avenger - america
Movie Se...
Teen Titans
Shape-Shifting
9989 1fbd746051cc006738435a8d64d1e1d1 Ban Dai £29.99
Beast Boy 5"
inch F...
Iron Maiden 8-
Inch Eddie 2
9990 cf75a470360f08eaac9e4d9882999cee IronMan £29.79
Mintutes To
Midnigh...
uniq_id product_name manufacturer price number_ava
Power Rangers
Dino Charge Power
9991 57c638712b4ecb8dac1ec3004039f1f1 £9.75
30 cm Blue Rangers
Ranger Fi...
Playskool
Heroes Super
9993 3e64e4223988a85f6884c8c6a85a75cc Super Heroes £9.99
Hero Repulsor
Drill Veh...
Factory
Entertainment
9994 791719b23e393dc4a3384d4d7777c089 Green Hornet £9.50
Green Hornet
Movie: Kato...
Batman 1966
TV Series
9995 44d6967f083825a5de36ad4865a65bcd Mattel £22.95
Action Figures
- The Rid...
Star Wars
Costume, Kids
9996 08f0747b6fc6687215ffb994c3a6fb32 Star Wars £39.99
Stormtrooper
Costume S...
Defiance
Lawkeeper Olde Scotland
9997 bf6cc073f8f24e6e338190fa16f6ee9d £43.99
Metal Badge Yard Ltd.
Prop Replica
Justice League
of America
9998 cd783d0b8b44e631b9788b203eaaefae DC Comics £49.81
Series 3 Green
Lante...
df.isnull().sum()
Out[61]:
uniq_id 0
product_name 0
manufacturer 7
price 1435
number_available_in_stock 2500
number_of_reviews 18
number_of_answered_questions 765
average_review_rating 18
amazon_category_and_sub_category 690
customers_who_bought_this_item_also_bought 1062
description 651
product_information 58
product_description 651
items_customers_buy_after_viewing_this_item 3065
customer_questions_and_answers 9086
customer_reviews 21
sellers 3082
dtype: int64
In [62]:
Out[62]:
uniq_id 0
product_name 0
manufacturer 7
price 1432
number_of_reviews 0
number_of_answered_questions 764
average_review_rating 0
amazon_category_and_sub_category 684
description 647
product_information 58
product_description 647
customer_reviews 3
dtype: int64
In [63]:
df.dtypes
Out[63]:
uniq_id object
product_name object
manufacturer object
price object
number_of_reviews object
number_of_answered_questions float64
average_review_rating object
amazon_category_and_sub_category object
description object
product_information object
product_description object
customer_reviews object
dtype: object
In [64]:
df['number_of_reviews'] = df['number_of_reviews'].str.replace(',','')
In [65]:
df['number_of_reviews'] = df['number_of_reviews'].astype(np.int64)
In [66]:
df.shape
Out[66]:
(9982, 12)
In [67]:
df['customer_reviews'][3]
Out[67]:
In [68]:
df=df['customer_reviews'].str.split("//",n=4,expand=True)
In [69]:
df.head()
Out[69]:
0 1 2 3 4
Worth Buying For The 6 April By\n \n Copnovelist\n Part of the magic for me
0 4.0
Pictures Alone (As Ever) 2014 \n on 6 April 2014 growing up as a boy ...
18
By\n \n kenneth bell\n Very happy with the
1 Four Stars 4.0 Dec.
\n on 18 Dec. 2... communication with funkyb...
2015
26
By\n \n Simon.B :-)\n Simple & GREAT FUN for
2 **Highly Recommended!** 5.0 May
\n on 26 May 2015 5+My nephews face was ...
2015
22 July By\n \n Lilla Lukacs\n I love it. Perfect with the earlier
3 I love it 5.0
2013 \n on 22 July 2... ordered l...
14
By\n \n Love my Dog\n Bought this for my Grandson's
4 Birthday present 5.0 April
\n on 14 April 2... birthday. He i...
2014
In [70]:
df['review title']=df[0]
df['rating']=df[1]
df['review_date']=df[2]
df['customer_name']=df[3]
df['review']=df[4]
df.head()
Out[70]:
Simple &
By\n \n
26 GREAT FUN
**Highly Simon.B :-)\n **Highly 26 M
2 5.0 May for 5+My 5.0
Recommended!** \n on 26 May Recommended!** 20
2015 nephews face
2015
was ...
By\n \n Love
14 Bought this for
my Dog\n \n 14 A
4 Birthday present 5.0 April my Grandson's Birthday present 5.0
on 14 April 20
2014 birthday. He i...
2...
In [71]:
df.drop(columns=0,inplace=True)
df.drop(columns=1,inplace=True)
df.drop(columns=2,inplace=True)
df.drop(columns=3,inplace=True)
df.drop(columns=4,inplace=True)
df.head()
Out[71]:
By\n \n
Worth Buying For The Part of the magic for me
0 4.0 6 April 2014 Copnovelist\n \n on
Pictures Alone (As Ever) growing up as a boy ...
6 April 2014
By\n \n Lilla
I love it. Perfect with the
3 I love it 5.0 22 July 2013 Lukacs\n \n on 22
earlier ordered l...
July 2...
In [72]:
df['customer_name'][1]
Out[72]:
df['customer_name']=df['customer_name'].str.split("\n \n",n=1,expand=True)
df.head()
Out[73]:
Worth Buying For The Pictures By\n \n Part of the magic for me
0 4.0 6 April 2014
Alone (As Ever) Copnovelist growing up as a boy ...
df['review']=df['review'].str.lower()
df['review']
Out[74]:
In [75]:
import string
df['review']= df['review'].replace(string.punctuation,'')
In [76]:
df['review'].head()
Out[76]:
In [19]:
df.isnull().sum()
Out[19]:
review title 3
rating 3
review_date 3
customer_name 3
review 3
dtype: int64
In [20]:
df.dropna(subset=['review'], inplace=True)
In [21]:
df.isnull().sum()
Out[21]:
review title 0
rating 0
review_date 0
customer_name 0
review 0
dtype: int64
In [22]:
import nltk
df['review']=df['review'].apply(nltk.word_tokenize)
In [23]:
df['review'].head(5)
Out[23]:
In [24]:
def remove_stopwords(text):
words=[w for w in text if w not in stopwords.words('english')]
return words
In [25]:
df['review']=df['review'].apply(lambda x:remove_stopwords(x))
df['review'].head(5)
Out[25]:
In [27]:
lemmatizer= WordNetLemmatizer()
def word_lemmatizer(text):
lem_text=[lemmatizer.lemmatize(i) for i in text ]
return lem_text
In [29]:
import nltk
nltk.download('wordnet')
Out[29]:
True
In [30]:
df['review']=df['review'].apply(lambda x: word_lemmatizer(x))
In [31]:
df['review'].head(5)
Out[31]:
In [34]:
In [35]:
df['review']=df['review'].apply(lambda x: word_stemmer(x))
In [36]:
df['review'].head(5)
Out[36]:
0 partmagicgrowboybuy(given)newhornbicatalogueve...
1 happicommunfunkybuy|fivestar//5.0//14jan.2016/...
2 simpl&greatfun5+minephewfaceamazopenbirthday!!...
3 love.perfectearlierorderlocomotive.again:would...
4 boughtgrandson'sbirthday.currentcollectbittrai...
Name: review, dtype: object
In [ ]: