/*
 * ceph: use rbtree for mds requests
 * [linux-2.6-block.git] / fs/ceph/mds_client.h
 * (annotated listing; columns: Commit, Line, Data)
 */
2f2dc053
SW
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
153c8e6b 5#include <linux/kref.h>
2f2dc053
SW
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/radix-tree.h>
44ca18f2 9#include <linux/rbtree.h>
2f2dc053
SW
10#include <linux/spinlock.h>
11
12#include "types.h"
13#include "messenger.h"
14#include "mdsmap.h"
15
/*
 * Some lock dependencies:
 *
 * session->s_mutex
 * mdsc->mutex
 *
 * mdsc->snap_rwsem
 *
 * inode->i_lock
 * mdsc->snap_flush_lock
 * mdsc->cap_delay_lock
 *
 */

/* forward declarations of types that are defined elsewhere */
struct ceph_client;
struct ceph_cap;

33/*
34 * parsed info about a single inode. pointers are into the encoded
35 * on-wire structures within the mds reply message payload.
36 */
37struct ceph_mds_reply_info_in {
38 struct ceph_mds_reply_inode *in;
39 u32 symlink_len;
40 char *symlink;
41 u32 xattr_len;
42 char *xattr_data;
43};
44
45/*
46 * parsed info about an mds reply, including information about the
47 * target inode and/or its parent directory and dentry, and directory
48 * contents (for readdir results).
49 */
50struct ceph_mds_reply_info_parsed {
51 struct ceph_mds_reply_head *head;
52
53 struct ceph_mds_reply_info_in diri, targeti;
54 struct ceph_mds_reply_dirfrag *dirfrag;
55 char *dname;
56 u32 dname_len;
57 struct ceph_mds_reply_lease *dlease;
58
59 struct ceph_mds_reply_dirfrag *dir_dir;
60 int dir_nr;
61 char **dir_dname;
62 u32 *dir_dname_len;
63 struct ceph_mds_reply_lease **dir_dlease;
64 struct ceph_mds_reply_info_in *dir_in;
65 u8 dir_complete, dir_end;
66
67 /* encoded blob describing snapshot contexts for certain
68 operations (e.g., open) */
69 void *snapblob;
70 int snapblob_len;
71};
72
73
/*
 * cap releases are batched and sent to the MDS en masse.
 *
 * CEPH_CAPS_PER_RELEASE is the number of struct ceph_mds_cap_item
 * entries that fit in PAGE_CACHE_SIZE bytes once a
 * struct ceph_mds_cap_release header has been subtracted.  The division
 * truncates, so any leftover bytes are simply unused.
 */
#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE -			\
				sizeof(struct ceph_mds_cap_release)) /	\
			       sizeof(struct ceph_mds_cap_item))


/*
 * state associated with each MDS<->client session
 *
 * The values are assigned explicitly and start at 1; no enumerator has
 * the value 0.
 */
enum {
	CEPH_MDS_SESSION_NEW = 1,
	CEPH_MDS_SESSION_OPENING = 2,
	CEPH_MDS_SESSION_OPEN = 3,
	CEPH_MDS_SESSION_HUNG = 4,
	CEPH_MDS_SESSION_CLOSING = 5,
	CEPH_MDS_SESSION_RESTARTING = 6,
	CEPH_MDS_SESSION_RECONNECTING = 7,
};

95struct ceph_mds_session {
96 struct ceph_mds_client *s_mdsc;
97 int s_mds;
98 int s_state;
99 unsigned long s_ttl; /* time until mds kills us */
100 u64 s_seq; /* incoming msg seq # */
101 struct mutex s_mutex; /* serialize session messages */
102
103 struct ceph_connection s_con;
104
4e7a5dcd
SW
105 struct ceph_authorizer *s_authorizer;
106 void *s_authorizer_buf, *s_authorizer_reply_buf;
107 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
108
2f2dc053
SW
109 /* protected by s_cap_lock */
110 spinlock_t s_cap_lock;
111 u32 s_cap_gen; /* inc each time we get mds stale msg */
112 unsigned long s_cap_ttl; /* when session caps expire */
113 struct list_head s_caps; /* all caps issued by this session */
114 int s_nr_caps, s_trim_caps;
115 int s_num_cap_releases;
116 struct list_head s_cap_releases; /* waiting cap_release messages */
117 struct list_head s_cap_releases_done; /* ready to send */
5dacf091 118 bool s_iterating_caps;
2f2dc053
SW
119
120 /* protected by mutex */
121 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
122 struct list_head s_cap_snaps_flushing;
123 unsigned long s_renew_requested; /* last time we sent a renew req */
124 u64 s_renew_seq;
125
126 atomic_t s_ref;
127 struct list_head s_waiting; /* waiting requests */
128 struct list_head s_unsafe; /* unsafe requests */
129};
130
/*
 * modes of choosing which MDS to send a request to
 *
 * No explicit values are given, so the enumerators are numbered 0, 1, 2
 * in declaration order.
 */
enum {
	USE_ANY_MDS,
	USE_RANDOM_MDS,
	USE_AUTH_MDS,	/* prefer authoritative mds for this metadata item */
};

/* forward declarations needed by the callback type below */
struct ceph_mds_request;
struct ceph_mds_client;

/*
 * request completion callback
 *
 * Receives the mds client and the request; returns nothing.
 */
typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
					     struct ceph_mds_request *req);

149/*
150 * an in-flight mds request
151 */
152struct ceph_mds_request {
153 u64 r_tid; /* transaction id */
44ca18f2 154 struct rb_node r_node;
2f2dc053
SW
155
156 int r_op; /* mds op code */
157 int r_mds;
158
159 /* operation on what? */
160 struct inode *r_inode; /* arg1 */
161 struct dentry *r_dentry; /* arg1 */
162 struct dentry *r_old_dentry; /* arg2: rename from or link from */
163 char *r_path1, *r_path2;
164 struct ceph_vino r_ino1, r_ino2;
165
166 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
167 struct inode *r_target_inode; /* resulting inode */
168
169 union ceph_mds_request_args r_args;
170 int r_fmode; /* file mode, if expecting cap */
171
172 /* for choosing which mds to send this request to */
173 int r_direct_mode;
174 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
175 bool r_direct_is_hash; /* true if r_direct_hash is valid */
176
177 /* data payload is used for xattr ops */
178 struct page **r_pages;
179 int r_num_pages;
180 int r_data_len;
181
182 /* what caps shall we drop? */
183 int r_inode_drop, r_inode_unless;
184 int r_dentry_drop, r_dentry_unless;
185 int r_old_dentry_drop, r_old_dentry_unless;
186 struct inode *r_old_inode;
187 int r_old_inode_drop, r_old_inode_unless;
188
189 struct ceph_msg *r_request; /* original request */
190 struct ceph_msg *r_reply;
191 struct ceph_mds_reply_info_parsed r_reply_info;
192 int r_err;
5b1daecd 193 bool r_aborted;
2f2dc053
SW
194
195 unsigned long r_timeout; /* optional. jiffies */
196 unsigned long r_started; /* start time to measure timeout against */
197 unsigned long r_request_started; /* start time for mds request only,
198 used to measure lease durations */
199
200 /* link unsafe requests to parent directory, for fsync */
201 struct inode *r_unsafe_dir;
202 struct list_head r_unsafe_dir_item;
203
204 struct ceph_mds_session *r_session;
205
206 int r_attempts; /* resend attempts */
207 int r_num_fwd; /* number of forward attempts */
208 int r_num_stale;
209 int r_resend_mds; /* mds to resend to next, if any*/
210
153c8e6b 211 struct kref r_kref;
2f2dc053
SW
212 struct list_head r_wait;
213 struct completion r_completion;
214 struct completion r_safe_completion;
215 ceph_mds_request_callback_t r_callback;
216 struct list_head r_unsafe_item; /* per-session unsafe list item */
217 bool r_got_unsafe, r_got_safe;
218
219 bool r_did_prepopulate;
220 u32 r_readdir_offset;
221
222 struct ceph_cap_reservation r_caps_reservation;
223 int r_num_caps;
224};
225
226/*
227 * mds client state
228 */
229struct ceph_mds_client {
230 struct ceph_client *client;
231 struct mutex mutex; /* all nested structures */
232
233 struct ceph_mdsmap *mdsmap;
234 struct completion safe_umount_waiters, session_close_waiters;
235 struct list_head waiting_for_map;
236
237 struct ceph_mds_session **sessions; /* NULL for mds if no session */
238 int max_sessions; /* len of s_mds_sessions */
239 int stopping; /* true if shutting down */
240
241 /*
242 * snap_rwsem will cover cap linkage into snaprealms, and
243 * realm snap contexts. (later, we can do per-realm snap
244 * contexts locks..) the empty list contains realms with no
245 * references (implying they contain no inodes with caps) that
246 * should be destroyed.
247 */
248 struct rw_semaphore snap_rwsem;
249 struct radix_tree_root snap_realms;
250 struct list_head snap_empty;
251 spinlock_t snap_empty_lock; /* protect snap_empty */
252
253 u64 last_tid; /* most recent mds request */
44ca18f2 254 struct rb_root request_tree; /* pending mds requests */
2f2dc053
SW
255 struct delayed_work delayed_work; /* delayed work */
256 unsigned long last_renew_caps; /* last time we renewed our caps */
257 struct list_head cap_delay_list; /* caps with delayed release */
258 spinlock_t cap_delay_lock; /* protects cap_delay_list */
259 struct list_head snap_flush_list; /* cap_snaps ready to flush */
260 spinlock_t snap_flush_lock;
261
262 u64 cap_flush_seq;
263 struct list_head cap_dirty; /* inodes with dirty caps */
264 int num_cap_flushing; /* # caps we are flushing */
265 spinlock_t cap_dirty_lock; /* protects above items */
266 wait_queue_head_t cap_flushing_wq;
267
039934b8 268#ifdef CONFIG_DEBUG_FS
2f2dc053 269 struct dentry *debugfs_file;
039934b8 270#endif
2f2dc053
SW
271
272 spinlock_t dentry_lru_lock;
273 struct list_head dentry_lru;
274 int num_dentry;
275};
276
/*
 * Declarations only; both functions are defined outside this header, so
 * their exact behavior is not visible here.
 */
extern const char *ceph_mds_op_name(int op);

/* NOTE(review): the leading "__" suggests the caller must hold a lock -- confirm */
extern struct ceph_mds_session *
__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, int mds);

282static inline struct ceph_mds_session *
283ceph_get_mds_session(struct ceph_mds_session *s)
284{
285 atomic_inc(&s->s_ref);
286 return s;
287}
288
/*
 * The functions below are declared here and defined elsewhere; their
 * semantics cannot be read from this header.
 */
extern void ceph_put_mds_session(struct ceph_mds_session *s);

extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
			     struct ceph_msg *msg, int mds);

extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
			  struct ceph_client *client);
extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);

extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);

extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
				    struct inode *inode,
				    struct dentry *dn, int mask);

/* request life cycle: create, then submit or do */
extern struct ceph_mds_request *
ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
				     struct ceph_mds_request *req);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
				struct inode *dir,
				struct ceph_mds_request *req);
312static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
313{
153c8e6b
SW
314 kref_get(&req->r_kref);
315}
316extern void ceph_mdsc_release_request(struct kref *kref);
317static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
318{
319 kref_put(&req->r_kref, ceph_mdsc_release_request);
2f2dc053 320}
2f2dc053
SW
321
322extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
323
324extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
325 int stop_on_nosnap);
326
327extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
328extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
329 struct inode *inode,
330 struct dentry *dentry, char action,
331 u32 seq);
332
333extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
334 struct ceph_msg *msg);
335
336#endif