Reputation: 207
I have a raw dataset. The sample below has a timestamp and three sentiment scores (negative, neutral, positive).
The raw dataset is:
created_time neg_sentiment neu_sentiment pos_sentiment
2015-01-12T23:27:53+0000 0 0 1
2015-01-13T00:36:15+0000 0 0 1
2015-01-13T00:39:37+0000 0.02 0 0.98
2015-01-13T01:26:05+0000 0.41 0.59 0
2015-01-15T16:10:46+0000 0.14 0.02 0.84
2015-02-13T02:38:59+0000 0.86 0.14 0
2015-01-13T21:00:15+0000 1 0 0
2015-01-14T04:47:47+0000 0.96 0.04 0
2015-02-14T06:09:17+0000 1 0 0
2015-02-14T06:10:05+0000 1 0 0
2015-01-14T06:44:47+0000 0.65 0.35 0
2015-03-14T06:47:13+0000 0.07 0.93 0
2015-01-14T10:16:09+0000 0 0 1
2015-01-14T10:17:38+0000 0.08 0.85 0.07
2015-01-14T17:30:03+0000 1 0 0
2015-01-14T20:17:43+0000 0.11 0 0.89
2015-01-16T02:49:13+0000 0.5 0.5 0
2015-03-26T13:20:06+0000 1 0 0
2015-01-21T04:26:45+0000 0.39 0.01 0.6
2015-03-21T04:38:49+0000 0.01 0 0.99
Using this dataset, I want to produce two outputs, where negative_proportion is calculated as neg_sentiment/(neg_sentiment + neu_sentiment + pos_sentiment).
The first output is by month:
created_time negative_proportion
2015-01 10
2015-02 20
2015-03 5
The second output is by day:
created_time negative_proportion
2015-01-12 10
2015-01-13 20
2015-01-14 3
2015-01-15 3
2015-01-16 3
2015-02-13 3
2015-02-14 3
2015-03-14 3
2015-03-21 3
2015-03-26 5
How could I make the desired output? Could you please help me or suggest the code?
The dput() output generated from the original dataset is below:
structure(list(created_time = structure(c(1L, 2L, 3L, 4L, 12L,
15L, 5L, 6L, 16L, 17L, 7L, 18L, 8L, 9L, 10L, 11L, 13L, 20L, 14L,
19L), .Label = c("2015-01-12T23:27:53+0000", "2015-01-13T00:36:15+0000",
"2015-01-13T00:39:37+0000", "2015-01-13T01:26:05+0000", "2015-01-13T21:00:15+0000",
"2015-01-14T04:47:47+0000", "2015-01-14T06:44:47+0000", "2015-01-14T10:16:09+0000",
"2015-01-14T10:17:38+0000", "2015-01-14T17:30:03+0000", "2015-01-14T20:17:43+0000",
"2015-01-15T16:10:46+0000", "2015-01-16T02:49:13+0000", "2015-01-21T04:26:45+0000",
"2015-02-13T02:38:59+0000", "2015-02-14T06:09:17+0000", "2015-02-14T06:10:05+0000",
"2015-03-14T06:47:13+0000", "2015-03-21T04:38:49+0000", "2015-03-26T13:20:06+0000"
), class = "factor"), neg_sentiment = c(0, 0, 0.02, 0.41, 0.14,
0.86, 1, 0.96, 1, 1, 0.65, 0.07, 0, 0.08, 1, 0.11, 0.5, 1, 0.39,
0.01), neu_sentiment = c(0, 0, 0, 0.59, 0.02, 0.14, 0, 0.04,
0, 0, 0.35, 0.93, 0, 0.85, 0, 0, 0.5, 0, 0.01, 0), pos_sentiment = c(1,
1, 0.98, 0, 0.84, 0, 0, 0, 0, 0, 0, 0, 1, 0.07, 0, 0.89, 0, 0,
0.6, 0.99)), class = "data.frame", row.names = c(NA, -20L))
Upvotes: 0
Views: 58
Reputation: 102625
Here is a base R solution:
# by month
# Key every row by the leading "YYYY-MM" of its timestamp and total the three
# sentiment columns within each key.
dfout <- aggregate(
  df[-1],
  by = data.frame(created_time = gsub("(\\d+-\\d+).*", "\\1", df[, 1])),
  FUN = sum
)
# Negative share of each month's combined sentiment, as a percentage. The
# right-hand side is evaluated before the new column exists, so rowSums()
# only sees the three sentiment totals.
dfout$neg_prop <- dfout$neg_sentiment / rowSums(dfout[-1]) * 100
such that
> dfout
created_time neg_sentiment neu_sentiment pos_sentiment neg_prop
1 2015-01 5.26 2.36 6.38 37.57143
2 2015-02 2.86 0.14 0.00 95.33333
3 2015-03 1.08 0.93 0.99 36.00000
# by day
# Key every row by the leading "YYYY-MM-DD" of its timestamp and total the
# three sentiment columns within each key.
dfout <- aggregate(
  df[-1],
  by = data.frame(created_time = gsub("(\\d+-\\d+-\\d+).*", "\\1", df[, 1])),
  FUN = sum
)
# Negative share of each day's combined sentiment, as a percentage. The
# right-hand side is evaluated before the new column exists, so rowSums()
# only sees the three sentiment totals.
dfout$neg_prop <- dfout$neg_sentiment / rowSums(dfout[-1]) * 100
such that
> dfout
created_time neg_sentiment neu_sentiment pos_sentiment neg_prop
1 2015-01-12 0.00 0.00 1.00 0.00000
2 2015-01-13 1.43 0.59 1.98 35.75000
3 2015-01-14 2.80 1.24 1.96 46.66667
4 2015-01-15 0.14 0.02 0.84 14.00000
5 2015-01-16 0.50 0.50 0.00 50.00000
6 2015-01-21 0.39 0.01 0.60 39.00000
7 2015-02-13 0.86 0.14 0.00 86.00000
8 2015-02-14 2.00 0.00 0.00 100.00000
9 2015-03-14 0.07 0.93 0.00 7.00000
10 2015-03-21 0.01 0.00 0.99 1.00000
11 2015-03-26 1.00 0.00 0.00 100.00000
Upvotes: 1
Reputation: 11
I would use a command like substring() to extract the first 10 characters of your created_time variable to create a variable with the date. You could do this to create a variable for month as well.
# Derive grouping keys from the leading characters of the timestamp text
# (e.g. "2015-01-12T23:27:53+0000").
# Day key: first 10 characters, "YYYY-MM-DD".
data$day <- substring(data$created_time, 1, 10)
# and/or
# Month key: first 7 characters, "YYYY-MM". Stopping at 6 would give "2015-0",
# which lumps months 01 through 09 into a single group.
data$month <- substring(data$created_time, 1, 7)
You've already provided the formula for calculating the negative proportion, so that's easy enough:
# Row-wise negative share: negative score over the sum of all three scores.
data$negative_proportion <- with(
  data,
  neg_sentiment / (neg_sentiment + neu_sentiment + pos_sentiment)
)
Good luck!
Upvotes: 1
Reputation: 4150
You can use lubridate on created_time:
library(lubridate)
#>
#> Attaching package: 'lubridate'
#> The following object is masked from 'package:base':
#>
#> date
library(tidyverse)
# Sample data as produced by dput(): a 20-row data.frame with a factor column
# `created_time` (timestamp strings such as "2015-01-12T23:27:53+0000") and
# three numeric columns: neg_sentiment, neu_sentiment, pos_sentiment.
df_example <- structure(list(created_time = structure(c(1L, 2L, 3L, 4L, 12L,
15L, 5L, 6L, 16L, 17L, 7L, 18L, 8L, 9L, 10L, 11L, 13L, 20L, 14L,
19L), .Label = c("2015-01-12T23:27:53+0000", "2015-01-13T00:36:15+0000",
"2015-01-13T00:39:37+0000", "2015-01-13T01:26:05+0000", "2015-01-13T21:00:15+0000",
"2015-01-14T04:47:47+0000", "2015-01-14T06:44:47+0000", "2015-01-14T10:16:09+0000",
"2015-01-14T10:17:38+0000", "2015-01-14T17:30:03+0000", "2015-01-14T20:17:43+0000",
"2015-01-15T16:10:46+0000", "2015-01-16T02:49:13+0000", "2015-01-21T04:26:45+0000",
"2015-02-13T02:38:59+0000", "2015-02-14T06:09:17+0000", "2015-02-14T06:10:05+0000",
"2015-03-14T06:47:13+0000", "2015-03-21T04:38:49+0000", "2015-03-26T13:20:06+0000"
), class = "factor"), neg_sentiment = c(0, 0, 0.02, 0.41, 0.14,
0.86, 1, 0.96, 1, 1, 0.65, 0.07, 0, 0.08, 1, 0.11, 0.5, 1, 0.39,
0.01), neu_sentiment = c(0, 0, 0, 0.59, 0.02, 0.14, 0, 0.04,
0, 0, 0.35, 0.93, 0, 0.85, 0, 0, 0.5, 0, 0.01, 0), pos_sentiment = c(1,
1, 0.98, 0, 0.84, 0, 0, 0, 0, 0, 0, 0, 1, 0.07, 0, 0.89, 0, 0,
0.6, 0.99)), class = "data.frame", row.names = c(NA, -20L))
# Monthly figures: bucket rows by calendar year and month, add up every
# numeric sentiment column per bucket, then take the negative share of the
# combined total.
df_example %>%
  group_by(year(created_time), month(created_time)) %>%
  summarise_if(is.numeric, function(col) sum(col, na.rm = TRUE)) %>%
  mutate(
    prop = neg_sentiment / (neg_sentiment + neu_sentiment + pos_sentiment)
  )
#> # A tibble: 3 x 6
#> # Groups: year(created_time) [1]
#> `year(created_t… `month(created_… neg_sentiment neu_sentiment pos_sentiment
#> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 2015 1 5.26 2.36 6.38
#> 2 2015 2 2.86 0.14 0
#> 3 2015 3 1.08 0.93 0.99
#> # … with 1 more variable: prop <dbl>
# Daily figures: bucket rows by calendar date, add up every numeric sentiment
# column per date, then take the negative share of the combined total.
df_example %>%
  group_by(as_date(created_time)) %>%
  summarise_if(is.numeric, function(col) sum(col, na.rm = TRUE)) %>%
  mutate(
    prop = neg_sentiment / (neg_sentiment + neu_sentiment + pos_sentiment)
  )
#> # A tibble: 11 x 5
#> `as_date(created_time)` neg_sentiment neu_sentiment pos_sentiment prop
#> <date> <dbl> <dbl> <dbl> <dbl>
#> 1 2015-01-12 0 0 1 0
#> 2 2015-01-13 1.43 0.59 1.98 0.358
#> 3 2015-01-14 2.8 1.24 1.96 0.467
#> 4 2015-01-15 0.14 0.02 0.84 0.14
#> 5 2015-01-16 0.5 0.5 0 0.5
#> 6 2015-01-21 0.39 0.01 0.6 0.39
#> 7 2015-02-13 0.86 0.14 0 0.86
#> 8 2015-02-14 2 0 0 1
#> 9 2015-03-14 0.07 0.93 0 0.07
#> 10 2015-03-21 0.01 0 0.99 0.01
#> 11 2015-03-26 1 0 0 1
Created on 2020-01-08 by the reprex package (v0.3.0)
Upvotes: 1