-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathBioshake.hs
189 lines (151 loc) · 6.79 KB
/
Bioshake.hs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE Rank2Types #-}
{-# LANGUAGE TemplateHaskell #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE ViewPatterns #-}
-- | Bioshake is a small framework for specifying bioinformatics pipelines. The
-- goal is to specify stages in a forward chaining manner (as is natural for the
-- domain) while guaranteeing as much robustness as possible to errors such as
-- mismatched file types or other attributes. Almost everything is handled in
-- the type system, and pipelines are compiled down to "Development.Shake"
-- 'Rules' for actual execution.
module Bioshake( module Types
, module Data.Reflection
, module Tags
, All(..)
, On(..)
, Referenced(..)
, Capture(..)
, ignoringIOErrors
, withTempDirectory
, bioshake
, out
, Out
, split
, withAll
, withPair) where
import Bioshake.Cluster.Torque
import Bioshake.Tags as Tags
import Bioshake.Types as Types
import qualified Control.Exception as E
import Control.Monad
import Control.Monad.Trans
import Control.Monad.Trans.State.Strict
import Data.List
import Data.Reflection (Given, give, given)
import qualified Data.Set as S
import Data.String
import Development.Shake
import Language.Haskell.TH
import System.Directory (copyFile,
removeDirectoryRecursive)
import System.IO.Temp (createTempDirectory)
-- | Attaches a reference genome.
class Referenced a where
-- | The path to the reference fasta file.
getRef :: a -> FilePath
-- | The short name, e.g., hg19.
name :: a -> String
-- | Path to dbNSFP for the genome
dbnsfp :: a -> FilePath
dbnsfp _ = error "dbNSFP not available"
-- | Gene annotations
annotations :: a -> FilePath
annotations _ = error "annotations not available"
-- | References flows down the pipeline regardless of the stage
instance Referenced a => Referenced (a :-> b) where
getRef (a :-> _) = getRef a
name (a :-> _) = name a
dbnsfp (a :-> _) = dbnsfp a
annotations (a :-> _) = annotations a
-- | Asserts a capture region.
class Capture a where
getBED :: a -> FilePath
instance Capture a => Capture (a :-> b) where
getBED (a :-> _) = getBED a
-- Hard naming outputs
data Out = Out [FilePath] deriving Show
-- | Explicitly names an output product. Outputs are automatically named in the
-- temporary directory except for this special case: this is how you obtain the
-- artifacts you are specifically interested in.
out = Out
instance Pathable (a :-> Out) where
paths (_ :-> Out outs) = outs
instance Pathable a => Buildable (a :-> Out) where
build ((paths -> inputs) :-> Out outs) = zipWithM_ ((liftIO .) . copyFile) inputs outs
$(allTransTagsPipe ''Out)
-- |Datatype to represent fan-in combinations.
data All a where
All :: (Functor f, Foldable f) => f a -> All a
-- |Fan-in style combinator. Takes a collection of combines their output paths
-- as input paths for the subsequent stage.
withAll :: (Functor f, Foldable f) => f a -> All a
withAll = All
-- | Explicitly construct a fan-in of exactly two items
withPair :: a -> a -> All a
withPair a b = All [a, b]
instance Compilable a => Compilable (All a) where
compile (All as) = mapM_ compile as
instance Pathable a => Pathable (All a) where
paths (All ps) = nub $ concatMap paths ps
-- |Fan-ins are 'Referenced' iff all items are consistently 'Referenced'. Problems are caught at runtime unfortunately.
instance Referenced a => Referenced (All a) where
getRef (All as) = foldl1 (\l r -> if l == r then l else error "cannot combine mixed references") $ fmap getRef as
name (All as) = foldl1 (\l r -> if l == r then l else error "cannot combine mixed references") $ fmap name as
dbnsfp (All as) = foldl1 (\l r -> if l == r then l else error "cannot combine mixed references") $ fmap dbnsfp as
annotations (All as) = foldl1 (\l r -> if l == r then l else error "cannot combine mixed references") $ fmap annotations as
-- |Fan-ins are a 'Capture' iff all items are consistent.
instance Capture a => Capture (All a) where
getBED (All as) = foldl1 (\l r -> if l == r then l else error "cannot combine mixed captures") $ fmap getBED as
instance Show a => Show (All a) where
show (All as) = foldl1 (\l r -> l ++ "," ++ r) $ fmap show as
$(allTransTags ''All)
-- |Datatype to split outputs
data On a = On a Int
instance Compilable a => Compilable (On a) where
compile (On a _) = compile a
instance Pathable a => Pathable (On a) where
paths (On a i) = [paths a !! i]
instance Referenced a => Referenced (On a) where
getRef (On a _) = getRef a
name (On a _) = name a
dbnsfp (On a _) = dbnsfp a
annotations (On a _) = annotations a
instance Capture a => Capture (On a) where
getBED (On a _) = getBED a
instance Show a => Show (On a) where
show (On a i) = "(" ++ show a ++ ")_" ++ show i
$(allTransTags ''On)
on :: Pathable a => a -> Int -> On a
on a i
| i >= 0 && i < length (paths a) = On a i
| otherwise = error "on: index out of bounds"
split :: Pathable a => a -> [On a]
split a = [on a i | i <- [0..n - 1]]
where
n = length $ paths a
-- | Entry point to bioshake. Like 'shakeArgs' but also takes a number of
-- threads to use.
bioshake :: Int -- ^ Number of threads
-> ShakeOptions -- ^ Options to pass to 'shakeArgs'.
-> (Given Resource => Rules ()) -> IO ()
bioshake n opts cont = shakeArgs opts{shakeThreads = n} $ do
res <- newResource "cpus" n
give res cont
-- | Creates a temporary directory under a target directory according to a
-- naming template. The directory is cleaned up after executing the action. This
-- differs from "Development.Shake"'s 'withTempDir' in that it takes a target
-- directory and template whereas "Development.Shake" uses /tmp. This is
-- generally more useful, as ./tmp is used as the target directory by convention
-- in BioShake.
withTempDirectory :: FilePath -- ^ Target directory under which the temporary directory is created
-> String -- ^ Template for the temporary directory name
-> (FilePath -> Action b) -- ^ Action to carry out
-> Action b
withTempDirectory targetDir template act = do
path <- liftIO $ createTempDirectory targetDir template
act path `actionFinally` (liftIO . ignoringIOErrors $ removeDirectoryRecursive path)
ignoringIOErrors :: IO () -> IO ()
ignoringIOErrors ioe = ioe `E.catch` (\e -> const (return ()) (e :: IOError))