Skip to content

Commit 034f694

Browse files
authored
Merge pull request #3 from JuliaMusic/weights
allow weights in sampling
2 parents 2e562dc + 5a5151e commit 034f694

5 files changed

+42
-17
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,8 @@ This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html
77
# master
88
work in progress changes are contained in this section.
99

10+
# v0.2.0
11+
Keyword argument `weights` allows you to choose weights for the motifs that are used in the initial sequence.
12+
1013
# v0.1.0 - Initial Release
1114
Changelog is kept with respect to this release.

REQUIRE

+1
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
11
julia 0.7.0-beta2
22
Combinatorics 0.6.0
3+
StatsBase 0.23.0

src/MotifSequenceGenerator.jl

+18-17
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ All main functionality is given by the function [`random_sequence`](@ref).
66
"""
77
module MotifSequenceGenerator
88

9-
using Combinatorics, Random
9+
using Combinatorics, Random, StatsBase
1010

1111
export random_sequence, all_possible_sums
1212

@@ -23,11 +23,12 @@ Base.showerror(io::IO, e::DeadEndMotifs) = print(io,
2323
The algorithm works as follows: First a random sequence of motifs is created,
2424
so that it has length of `q - δq ≤ ℓ ≤ q - δq + maximum(motiflengths)`.
2525
The maximum number of random sequences that are tried is set by the `tries` keyword (default `5`).
26+
The sequence is optionally sampled given a probability vector.
2627
27-
For each random try, it is first check whether the sequence is already correct.
28+
For each random try, it is first checked whether the sequence is already correct.
2829
If not, the last entry of the sequence is dropped. Then, since the sequence is now
2930
already smaller than `q`, all possible sums of `summands` out of the motif pool
30-
are checked. If some combination of `summands` sums to exactly the difference,
31+
are checked. If some combination of `summands` sums to the difference,
3132
they are added to the sequence.
3233
For multiple satisfactory combinations, a random one is picked.
3334
@@ -50,7 +51,9 @@ a proper one, an error is thrown.
5051
Create a random sequence of motifs of type `M`, under the constraint that the
5152
sequence has "length" `ℓ` **exactly** within `q - δq ≤ ℓ ≤ q + δq`.
5253
Return the sequence itself as well as the
53-
sequence of indices of `motifs` used to create it.
54+
sequence of indices of `motifs` used to create it. A vector of probabilities `weights`
55+
can be given as a keyword argument, which then dictates the sampling probability
56+
for each entry of `motifs` for the initial sequence created.
5457
5558
"length" here means an abstracted length defined by the struct `M`,
5659
based on the `limits` and `translate` functions.
@@ -64,7 +67,7 @@ It does **not** refer to the amount of elements!
6467
motif which is translated by `t` (either negative or positive), with
6568
respect to the same units as `q`.
6669
67-
## Keywords
70+
## Other Keywords
6871
Please see the source code (use `@which`) for a full description of the algorithm.
6972
7073
* `tries = 5` : Up to how many initial random sequences are accepted.
@@ -74,7 +77,10 @@ Please see the source code (use `@which`) for a full description of the algorith
7477
"""
7578
function random_sequence(motifs::Vector{M}, q,
7679
limits, translate, δq = 0;
77-
tries = 5, summands = 3, tailcut = 2) where {M}
80+
tries = 5, summands = 3, tailcut = 2,
81+
weights = ones(length(motifs))) where {M}
82+
83+
ws = _toweight(weights)
7884

7985
idxs = 1:length(motifs)
8086
motifs0, motiflens = _motifs_at_origin(motifs, limits, translate)
@@ -94,7 +100,7 @@ function random_sequence(motifs::Vector{M}, q,
94100
while worked == false
95101
count > tries && throw(DeadEndMotifs(tries, summands, tailcut))
96102

97-
seq, seq_length = _random_sequence_try(motiflens, q, δq)
103+
seq, seq_length = _random_sequence_try(motiflens, q, δq, ws)
98104

99105
worked = _complete_sequence!(seq, motiflens, q, δq, summands, tailcut)
100106

@@ -104,6 +110,8 @@ function random_sequence(motifs::Vector{M}, q,
104110
return _instantiate_sequence(motifs0, motiflens, seq, translate), seq
105111
end
106112

113+
# Normalize `a` by its sum and wrap the result in a StatsBase `ProbabilityWeights`
# whose total is declared as 1.
function _toweight(a)
    total = sum(a)
    normalized = a ./ total
    return ProbabilityWeights(normalized, 1)
end
114+
107115
"""
108116
_motifs_at_origin(motifs, limits, translate) -> (motifs0, motiflens)
109117
Bring all motifs to the origin and compute the motif lengths.
@@ -121,15 +129,15 @@ function _motifs_at_origin(motifs::Vector{M}, limits, translate) where M
121129
end
122130

123131
"""
124-
_random_sequence_try(motiflens, q) -> seq, seq_length
132+
_random_sequence_try(motiflens, q, δq [, ws]) -> seq, seq_length
125133
Return a random sequence of motif indices
126134
so that the total sequence is *guaranteed* to have total length of
127135
`q - δq ≤ ℓ ≤ q - δq + maximum(motiflens)`.
128136
"""
129-
function _random_sequence_try(motiflens, q, δq)
137+
function _random_sequence_try(motiflens, q, δq, ws = defaultweights(motiflens))
130138
seq = Int[]; seq_length = 0; idxs = 1:length(motiflens)
131139
while seq_length < q - δq
132-
i = rand(idxs)
140+
i = sample(idxs, ws)
133141
push!(seq, i)
134142
seq_length += motiflens[i]
135143
end
@@ -174,13 +182,6 @@ function _complete_sequence_remainder!(seq, motiflens, q, δq, summands, tailcut
174182
pop!(seq)
175183
isempty(seq) && return false
176184

177-
# I Think the following if is unecessary?...
178-
# if q - δq - sum(motiflens[k] for k in seq) < 0
179-
# ok = _complete_sequence_extra!(seq, motiflens, q, δq)
180-
# isempty(seq) && return false
181-
# ok && return true
182-
# end
183-
184185
# At this point ℓ is guaranteed less than q - δq
185186
remainder = q - δq - sum(motiflens[k] for k in seq)
186187
@assert remainder > 0

test/float_length_tests.jl

+10
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,16 @@ end
3535
end
3636
end
3737

38+
@testset "Float Length, Weights, δq=$(δq)" for δq in [1.0, 2.0]
39+
weights = rand(1:5, N)
40+
for j in 1:N
41+
r, s = random_sequence(shouts, q, shoutlimits, shouttranslate, δq;
42+
weights = weights)
43+
ℓ = shoutlens(r)
44+
@test q - δq ≤ ℓ ≤ q + δq
45+
end
46+
end
47+
3848
using MotifSequenceGenerator: DeadEndMotifs
3949
@test_throws ArgumentError random_sequence(shouts, q, shoutlimits, shouttranslate, 0.0)
4050
@test_throws DeadEndMotifs random_sequence(shouts, q, shoutlimits, shouttranslate, 0.000001)

test/integer_length_tests.jl

+10
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,16 @@ end
3434
end
3535
end
3636

37+
@testset "Integer Length, Weights, δq=$(δq)" for δq in [0, 2]
38+
weights = rand(1:5, N)
39+
for j in 1:N
40+
r, s = random_sequence(shouts, q, shoutlimits, shouttranslate, δq;
41+
tries = 10, weights = weights)
42+
ℓ = shoutlens(r)
43+
@test q - δq ≤ ℓ ≤ q + δq
44+
end
45+
end
46+
3747
using MotifSequenceGenerator: DeadEndMotifs
3848
@test_throws DeadEndMotifs random_sequence(shouts, 7, shoutlimits, shouttranslate, 0)
3949

0 commit comments

Comments
 (0)