딥스탯 2018. 10. 12. 23:55
06_rows

Introduction to DataFrames

Bogumił Kamiński, Apr 21, 2018

Reference

Series

In [1]:
using DataFrames, Random # load package
Random.seed!(1); #srand(1);

Manipulating rows of DataFrame

Reordering rows

In [2]:
x = DataFrame(id=1:10, x = rand(10), y = [zeros(5); ones(5)]) # and we hope that x[:x] is not sorted :)
Out[2]:
idxy
Int64Float64Float64
110.2360330.0
220.3465170.0
330.3127070.0
440.007909280.0
550.4886130.0
660.2109681.0
770.9519161.0
880.9999051.0
990.2516621.0
10100.9866661.0
In [3]:
issorted(x), issorted(x, :x) # check if a DataFrame or a subset of its columns is sorted
Out[3]:
(true, false)
In [4]:
sort!(x, :x) # sort x in place
Out[4]:
idxy
Int64Float64Float64
140.007909280.0
260.2109681.0
310.2360330.0
490.2516621.0
530.3127070.0
620.3465170.0
750.4886130.0
870.9519161.0
9100.9866661.0
1080.9999051.0
In [5]:
y = sort(x, :id) # new DataFrame
Out[5]:
idxy
Int64Float64Float64
110.2360330.0
220.3465170.0
330.3127070.0
440.007909280.0
550.4886130.0
660.2109681.0
770.9519161.0
880.9999051.0
990.2516621.0
10100.9866661.0
In [6]:
sort(x, (:y, :x), rev=(true, false)) # sort by two columns, first is decreasing, second is increasing
Out[6]:
idxy
Int64Float64Float64
160.2109681.0
290.2516621.0
370.9519161.0
4100.9866661.0
580.9999051.0
640.007909280.0
710.2360330.0
830.3127070.0
920.3465170.0
1050.4886130.0
In [7]:
sort(x, (order(:y, rev=true), :x)) # the same as above
Out[7]:
idxy
Int64Float64Float64
160.2109681.0
290.2516621.0
370.9519161.0
4100.9866661.0
580.9999051.0
640.007909280.0
710.2360330.0
830.3127070.0
920.3465170.0
1050.4886130.0
In [8]:
sort(x, (order(:y, rev=true), order(:x, by=v->-v))) # some more fancy sorting stuff
Out[8]:
idxy
Int64Float64Float64
180.9999051.0
2100.9866661.0
370.9519161.0
490.2516621.0
560.2109681.0
650.4886130.0
720.3465170.0
830.3127070.0
910.2360330.0
1040.007909280.0
In [9]:
x[shuffle(1:10), :] # reorder rows (here randomly)
Out[9]:
idxy
Int64Float64Float64
180.9999051.0
2100.9866661.0
370.9519161.0
420.3465170.0
550.4886130.0
610.2360330.0
760.2109681.0
890.2516621.0
940.007909280.0
1030.3127070.0
In [10]:
sort!(x, :id)
x[[1,10],:] = x[[10,1],:] # swap rows
x
Out[10]:
idxy
Int64Float64Float64
1100.9866661.0
220.3465170.0
330.3127070.0
440.007909280.0
550.4886130.0
660.2109681.0
770.9519161.0
880.9999051.0
990.2516621.0
1010.2360330.0
In [11]:
x[1,:], x[10,:] = x[10,:], x[1,:] # and swap again
x
Out[11]:
idxy
Int64Float64Float64
110.2360330.0
220.3465170.0
330.3127070.0
440.007909280.0
550.4886130.0
660.2109681.0
770.9519161.0
880.9999051.0
990.2516621.0
10100.9866661.0

Merging/adding rows

In [12]:
x = DataFrame(rand(3, 5))
Out[12]:
x1x2x3x4x5
Float64Float64Float64Float64Float64
10.08563520.1858210.05161460.2793950.370971
20.5532060.1119810.538030.1782460.894166
30.463350.9763120.4556920.5489830.648054
In [13]:
[x; x] # merge by rows - data frames must have the same column names; the same is vcat
Out[13]:
x1x2x3x4x5
Float64Float64Float64Float64Float64
10.08563520.1858210.05161460.2793950.370971
20.5532060.1119810.538030.1782460.894166
30.463350.9763120.4556920.5489830.648054
40.08563520.1858210.05161460.2793950.370971
50.5532060.1119810.538030.1782460.894166
60.463350.9763120.4556920.5489830.648054
In [14]:
y = x[reverse(names(x))] # get y with other order of names
Out[14]:
x5x4x3x2x1
Float64Float64Float64Float64Float64
10.3709710.2793950.05161460.1858210.0856352
20.8941660.1782460.538030.1119810.553206
30.6480540.5489830.4556920.9763120.46335
In [15]:
vcat(x, y) # we get what we want as vcat does column name matching
Out[15]:
x1x2x3x4x5
Float64Float64Float64Float64Float64
10.08563520.1858210.05161460.2793950.370971
20.5532060.1119810.538030.1782460.894166
30.463350.9763120.4556920.5489830.648054
40.08563520.1858210.05161460.2793950.370971
50.5532060.1119810.538030.1782460.894166
60.463350.9763120.4556920.5489830.648054
In [16]:
vcat(x, y[1:3]) # but column names must still match
ArgumentError: column(s) x1 and x2 are missing from argument(s) 2

Stacktrace:
 [1] _vcat(::Array{DataFrame,1}) at /home/yt/.julia/packages/DataFrames/1PqZ3/src/abstractdataframe/abstractdataframe.jl:926
 [2] vcat(::DataFrame, ::DataFrame) at /home/yt/.julia/packages/DataFrames/1PqZ3/src/abstractdataframe/abstractdataframe.jl:906
 [3] top-level scope at In[16]:1
In [17]:
append!(x, x) # the same but modifies x
Out[17]:
x1x2x3x4x5
Float64Float64Float64Float64Float64
10.08563520.1858210.05161460.2793950.370971
20.5532060.1119810.538030.1782460.894166
30.463350.9763120.4556920.5489830.648054
40.08563520.1858210.05161460.2793950.370971
50.5532060.1119810.538030.1782460.894166
60.463350.9763120.4556920.5489830.648054
In [18]:
append!(x, y) # here column names must match exactly
Column names do not match

Stacktrace:
 [1] error(::String) at ./error.jl:33
 [2] append!(::DataFrame, ::DataFrame) at /home/yt/.julia/packages/DataFrames/1PqZ3/src/dataframe/dataframe.jl:990
 [3] top-level scope at In[18]:1
In [19]:
push!(x, 1:5) # add one row to x at the end; must give correct number of values and correct types
x
Out[19]:
x1x2x3x4x5
Float64Float64Float64Float64Float64
10.08563520.1858210.05161460.2793950.370971
20.5532060.1119810.538030.1782460.894166
30.463350.9763120.4556920.5489830.648054
40.08563520.1858210.05161460.2793950.370971
50.5532060.1119810.538030.1782460.894166
60.463350.9763120.4556920.5489830.648054
71.02.03.04.05.0
In [20]:
push!(x, Dict(:x1=> 11, :x2=> 12, :x3=> 13, :x4=> 14, :x5=> 15)) # also works with dictionaries
x
Out[20]:
x1x2x3x4x5
Float64Float64Float64Float64Float64
10.08563520.1858210.05161460.2793950.370971
20.5532060.1119810.538030.1782460.894166
30.463350.9763120.4556920.5489830.648054
40.08563520.1858210.05161460.2793950.370971
50.5532060.1119810.538030.1782460.894166
60.463350.9763120.4556920.5489830.648054
71.02.03.04.05.0
811.012.013.014.015.0

Subsetting/removing rows

In [21]:
x = DataFrame(id=1:10, val='a':'j')
Out[21]:
idval
Int64Char
11'a'
22'b'
33'c'
44'd'
55'e'
66'f'
77'g'
88'h'
99'i'
1010'j'
In [22]:
x[1:2, :] # by index
Out[22]:
idval
Int64Char
11'a'
22'b'
In [23]:
view(x, 1:2) # the same but a view
Out[23]:
idval
Int64Char
11'a'
22'b'
In [24]:
x[repeat([true, false], 5), :] # by Bool, exact length required
#x[repmat([true, false], 5), :]
Out[24]:
idval
Int64Char
11'a'
23'c'
35'e'
47'g'
59'i'
In [25]:
view(x, repeat([true, false], 5), :) # view again
#view(x, repmat([true, false], 5), :)
Out[25]:
idval
Int64Char
11'a'
23'c'
35'e'
47'g'
59'i'
In [26]:
deleterows!(x, 7) # delete one row
Out[26]:
idval
Int64Char
11'a'
22'b'
33'c'
44'd'
55'e'
66'f'
78'h'
89'i'
910'j'
In [27]:
deleterows!(x, 6:7) # delete a collection of rows
Out[27]:
idval
Int64Char
11'a'
22'b'
33'c'
44'd'
55'e'
69'i'
710'j'
In [28]:
x = DataFrame([1:4, 2:5, 3:6])
Out[28]:
x1x2x3
Int64Int64Int64
1123
2234
3345
4456
In [29]:
filter(r -> r[:x1] > 2.5, x) # create a new DataFrame where filtering function operates on DataFrameRow
Out[29]:
x1x2x3
Int64Int64Int64
1345
2456
In [30]:
# in place modification of x, an example with do-block syntax
filter!(x) do r
    if r[:x1] > 2.5
        return r[:x2] < 4.5
    end
    r[:x3] < 3.5
end
Out[30]:
x1x2x3
Int64Int64Int64
1123
2345

Deduplicating

In [31]:
x = DataFrame(A=[1,2], B=["x","y"])
append!(x, x)
x[:C] = 1:4
x
Out[31]:
ABC
Int64StringInt64
11x1
22y2
31x3
42y4
In [32]:
unique(x, [1,2]) # get first unique rows for given index
Out[32]:
ABC
Int64StringInt64
11x1
22y2
In [33]:
unique(x) # now we look at whole rows
Out[33]:
ABC
Int64StringInt64
11x1
22y2
31x3
42y4
In [34]:
nonunique(x, :A) # get indicators of non-unique rows
Out[34]:
4-element Array{Bool,1}:
 false
 false
  true
  true
In [35]:
unique!(x, :B) # modify x in place
Out[35]:
ABC
Int64StringInt64
11x1
22y2

Extracting one row from DataFrame into a vector

In [36]:
x = DataFrame(x=[1,missing,2], y=["a", "b", missing], z=[true,false,true])
Out[36]:
xyz
Int64⍰String⍰Bool
11atrue
2missingbfalse
32missingtrue
In [37]:
cols = [:x, :y]
[x[1, col] for col in cols] # subset of columns
Out[37]:
2-element Array{Any,1}:
 1   
  "a"
In [38]:
[[x[i, col] for col in names(x)] for i in 1:nrow(x)] # vector of vectors, each entry contains one full row of x
Out[38]:
3-element Array{Array{Any,1},1}:
 [1, "a", true]       
 [missing, "b", false]
 [2, missing, true]   
In [39]:
Tuple(x[1, col] for col in cols) # similar construct for Tuples
Out[39]:
(1, "a")