/*** * * ipv4/tcp/tcp.c - TCP implementation for RTnet * * Copyright (C) 2009 Vladimir Zapolskiy * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License, version 2, as * published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "timerwheel.h" static unsigned int close_timeout = 1000; module_param(close_timeout, uint, 0664); MODULE_PARM_DESC(close_timeout, "max time (ms) to wait during close for FIN-ACK handshake to complete, default 1000"); #ifdef CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION static unsigned int error_rate; module_param(error_rate, uint, 0664); MODULE_PARM_DESC(error_rate, "simulate packet loss after every n packets"); static unsigned int multi_error = 1; module_param(multi_error, uint, 0664); MODULE_PARM_DESC(multi_error, "on simulated error, drop n packets in a row"); static unsigned int counter_start = 1234; module_param(counter_start, uint, 0664); MODULE_PARM_DESC(counter_start, "start value of per-socket packet counter " "(used for error injection)"); #endif /* CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION */ struct tcp_sync { u32 seq; u32 ack_seq; /* Local window size sent to peer */ u16 window; /* Last received destination peer window size */ u16 dst_window; }; /* connection timeout */ /* 1 second */ static const nanosecs_rel_t rt_tcp_connection_timeout = 1000000000ull; /* retransmission timerwheel timeout */ static const u64 rt_tcp_retransmit_timeout = 100000000ull; /* keepalive constants */ /* 75 seconds */ static const u64 rt_tcp_keepalive_intvl = 75000000000ull; /* 9 probes to send */ static const u8 rt_tcp_keepalive_probes = 9; /* 2 hours */ static const u64 rt_tcp_keepalive_timeout = 7200000000000ull; /* retransmission timeout */ /* 50 milliseconds */ static const nanosecs_rel_t rt_tcp_retransmission_timeout = 50000000ull; /* maximum allowed number of retransmissions */ static const unsigned int max_retransmits = 3; struct tcp_keepalive { u8 enabled; u32 probes; rtdm_timer_t timer; }; /*** * This structure is used to register a TCP socket for reception. All * structures are kept in the port_registry array to increase the cache * locality during the critical port lookup in rt_tcp_v4_lookup().
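* For example, if RT_TCP_SOCKETS is 64 (an assumed value for illustration), * port_hash below has 128 buckets and a socket is filed under bucket * (sport & 127), with sport kept in network byte order; rt_tcp_v4_lookup() * then walks one short hlist instead of scanning the whole registry.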
*/ /* if dport & daddr are zero, this is a listening socket */ /* otherwise the structure describes a connection */ /* NB: sock->prot.inet.saddr & sock->prot.inet.sport values are not used */ struct tcp_socket { struct rtsocket sock; /* set up by rt_socket_init() implicitly */ u16 sport; /* local port */ u32 saddr; /* local ip-addr */ u16 dport; /* destination port */ u32 daddr; /* destination ip-addr */ u8 tcp_state; /* tcp connection state */ u8 is_binding; /* if set, tcp socket port binding is in progress */ u8 is_bound; /* if set, tcp socket is already port bound */ u8 is_valid; /* if set, read() and write() can proceed */ u8 is_accepting; /* if set, accept() is in progress */ u8 is_accepted; /* if set, accept() is already called */ u8 is_closed; /* close() call for resource deallocation follows */ rtdm_event_t send_evt; /* write request is permissible */ rtdm_event_t conn_evt; /* connection event */ struct dest_route rt; struct tcp_sync sync; struct tcp_keepalive keepalive; rtdm_lock_t socket_lock; struct hlist_node link; nanosecs_rel_t sk_sndtimeo; /* retransmission routine data */ u32 nacked_first; unsigned int timer_state; struct rtskb_queue retransmit_queue; struct timerwheel_timer timer; struct completion fin_handshake; rtdm_nrtsig_t close_sig; #ifdef CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION unsigned int packet_counter; unsigned int error_rate; unsigned int multi_error; #endif /* CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION */ }; struct rt_tcp_dispatched_packet_send_cmd { __be32 flags; /* packet flags value */ struct tcp_socket *ts; }; /*** * Automatic port number assignment * The automatic assignment of port numbers to unbound sockets is realised as * a simple addition of two values: * - the socket ID (lower 8 bits of file descriptor) which is set during * initialisation and left unchanged afterwards * - the start value tcp_auto_port_start which is a module parameter * tcp_auto_port_mask, also a module parameter, is used to define the range of * port numbers which are used for automatic assignment. Any number within * this range will be rejected when passed to bind_rt().
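* Worked example (host byte order for clarity, RT_TCP_SOCKETS = 64 assumed): * with tcp_auto_port_start = 1024, tcp_auto_port_mask = ~63 = 0xffc0, so * unbound sockets are assigned port 1024 + index (1024..1087), and bind_rt() * rejects any requested port p for which (p & 0xffc0) == 1024.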
*/ MODULE_LICENSE("GPL"); static struct { struct rtdm_dev_context dummy; struct tcp_socket rst_socket; } rst_socket_container; #define rst_fd (&rst_socket_container.dummy.fd) #define rst_socket (*(struct tcp_socket *)rtdm_fd_to_private(rst_fd)) static u32 tcp_auto_port_start = 1024; static u32 tcp_auto_port_mask = ~(RT_TCP_SOCKETS - 1); static u32 free_ports = RT_TCP_SOCKETS; #define RT_PORT_BITMAP_WORDS \ ((RT_TCP_SOCKETS + BITS_PER_LONG - 1) / BITS_PER_LONG) static unsigned long port_bitmap[RT_PORT_BITMAP_WORDS]; static struct tcp_socket *port_registry[RT_TCP_SOCKETS]; static DEFINE_RTDM_LOCK(tcp_socket_base_lock); static struct hlist_head port_hash[RT_TCP_SOCKETS * 2]; #define port_hash_mask (RT_TCP_SOCKETS * 2 - 1) module_param(tcp_auto_port_start, uint, 0444); module_param(tcp_auto_port_mask, uint, 0444); MODULE_PARM_DESC(tcp_auto_port_start, "Start of automatically assigned " "port range for TCP"); MODULE_PARM_DESC(tcp_auto_port_mask, "Mask that defines port range for TCP " "for automatic assignment"); static inline struct tcp_socket *port_hash_search(u32 saddr, u16 sport) { u32 bucket = sport & port_hash_mask; struct tcp_socket *ts; hlist_for_each_entry (ts, &port_hash[bucket], link) if (ts->sport == sport && (saddr == INADDR_ANY || ts->saddr == saddr || ts->saddr == INADDR_ANY)) return ts; return NULL; } static int port_hash_insert(struct tcp_socket *ts, u32 saddr, u16 sport) { u32 bucket; if (port_hash_search(saddr, sport)) return -EADDRINUSE; bucket = sport & port_hash_mask; ts->saddr = saddr; ts->sport = sport; ts->daddr = 0; ts->dport = 0; hlist_add_head(&ts->link, &port_hash[bucket]); return 0; } static inline void port_hash_del(struct tcp_socket *ts) { hlist_del(&ts->link); } /*** * rt_tcp_v4_lookup */ static struct rtsocket *rt_tcp_v4_lookup(u32 daddr, u16 dport) { rtdm_lockctx_t context; struct tcp_socket *ts; int ret; rtdm_lock_get_irqsave(&tcp_socket_base_lock, context); ts = port_hash_search(daddr, dport); if (ts != NULL) { ret = rt_socket_reference(&ts->sock); if (ret == 0 || (ret == -EIDRM && ts->is_closed)) { rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); return &ts->sock; } } rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); return NULL; } /* test seq1 <= seq2, modulo 2^32 (the signed difference handles wraparound) */ static inline int rt_tcp_before(__u32 seq1, __u32 seq2) { return (__s32)(seq1 - seq2) <= 0; } /* test seq1 >= seq2, modulo 2^32 */ static inline int rt_tcp_after(__u32 seq1, __u32 seq2) { return (__s32)(seq2 - seq1) <= 0; } static inline u32 rt_tcp_compute_ack_seq(struct tcphdr *th, u32 len) { u32 ack_seq = ntohl(th->seq) + len; if (unlikely(th->syn || th->fin)) ack_seq++; return ack_seq; } static void rt_tcp_keepalive_start(struct tcp_socket *ts) { if (ts->tcp_state == TCP_ESTABLISHED) { rtdm_timer_start(&ts->keepalive.timer, rt_tcp_keepalive_timeout, 0, RTDM_TIMERMODE_RELATIVE); } } static void rt_tcp_keepalive_stop(struct tcp_socket *ts) { if (ts->tcp_state == TCP_ESTABLISHED) { rtdm_timer_stop(&ts->keepalive.timer); } } #ifdef YET_UNUSED static void rt_tcp_keepalive_timer(rtdm_timer_t *timer); static void rt_tcp_keepalive_enable(struct tcp_socket *ts) { rtdm_lockctx_t context; struct tcp_keepalive *keepalive; rtdm_lock_get_irqsave(&ts->socket_lock, context); keepalive = &ts->keepalive; if (keepalive->enabled) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } keepalive->probes = rt_tcp_keepalive_probes; rtdm_timer_init(&keepalive->timer, rt_tcp_keepalive_timer, "RT TCP keepalive timer"); rt_tcp_keepalive_start(ts); keepalive->enabled = 1; rtdm_lock_put_irqrestore(&ts->socket_lock,
context); } #endif static void rt_tcp_keepalive_disable(struct tcp_socket *ts) { struct tcp_keepalive *keepalive; keepalive = &ts->keepalive; if (!keepalive->enabled) { return; } rt_tcp_keepalive_stop(ts); rtdm_timer_destroy(&keepalive->timer); keepalive->enabled = 0; } static void rt_tcp_keepalive_feed(struct tcp_socket *ts) { rtdm_lockctx_t context; struct tcp_keepalive *keepalive; rtdm_lock_get_irqsave(&ts->socket_lock, context); keepalive = &ts->keepalive; if (ts->tcp_state == TCP_ESTABLISHED && ts->keepalive.enabled) { keepalive->probes = rt_tcp_keepalive_probes; /* Restart keepalive timer */ rtdm_timer_stop(&keepalive->timer); rtdm_timer_start(&keepalive->timer, rt_tcp_keepalive_timeout, 0, RTDM_TIMERMODE_RELATIVE); rtdm_lock_put_irqrestore(&ts->socket_lock, context); } else { rtdm_lock_put_irqrestore(&ts->socket_lock, context); } } static int rt_tcp_socket_invalidate(struct tcp_socket *ts, u8 to_state) { int signal = ts->is_valid; ts->tcp_state = to_state; /* multiple invalidation could happen without fuss, see rt_tcp_close(), rt_tcp_rcv(), timeout expiration etc. */ if (ts->is_valid) { ts->is_valid = 0; if (ts->keepalive.enabled) { rt_tcp_keepalive_stop(ts); } } return signal; } static void rt_tcp_socket_invalidate_signal(struct tcp_socket *ts) { /* wake up all readers and writers by destroying the events */ rtdm_sem_destroy(&ts->sock.pending_sem); rtdm_event_destroy(&ts->send_evt); } static void rt_tcp_socket_validate(struct tcp_socket *ts) { ts->tcp_state = TCP_ESTABLISHED; ts->is_valid = 1; if (ts->keepalive.enabled) { rt_tcp_keepalive_start(ts); } rtdm_event_init(&ts->send_evt, 0); } /*** * rt_tcp_retransmit_handler - timerwheel handler to process a retransmission * @data: pointer to a rttcp socket structure */ static void rt_tcp_retransmit_handler(void *data) { struct tcp_socket *ts = (struct tcp_socket *)data; struct rtskb *skb; rtdm_lockctx_t context; int signal; rtdm_lock_get_irqsave(&ts->socket_lock, context); if (unlikely(rtskb_queue_empty(&ts->retransmit_queue))) { /* handler invoked, but retransmission queue is empty */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_printk("rttcp: bug in RT TCP retransmission routine\n"); return; } if (ts->tcp_state == TCP_CLOSE) { /* socket is already closed */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } if (ts->timer_state) { /* more tries */ ts->timer_state--; timerwheel_add_timer(&ts->timer, rt_tcp_retransmission_timeout); /* warning, rtskb_clone is under lock */ skb = rtskb_clone(ts->retransmit_queue.first, &ts->sock.skb_pool); rtdm_lock_put_irqrestore(&ts->socket_lock, context); if (unlikely(skb == NULL)) { rtdm_printk("rttcp: can't clone skb for retransmission\n"); return; } /* BUG, window changes are not respected */ if (unlikely(rtdev_xmit(skb) != 0)) { kfree_rtskb(skb); rtdm_printk( "rttcp: packet retransmission from timer failed\n"); } } else { ts->timer_state = max_retransmits; /* report connection loss */ signal = rt_tcp_socket_invalidate(ts, TCP_CLOSE); rtdm_lock_put_irqrestore(&ts->socket_lock, context); if (signal) rt_tcp_socket_invalidate_signal(ts); /* retransmission queue will be cleaned up in rt_tcp_socket_destruct */ rtdm_printk("rttcp: connection lost by NACK timeout\n"); } } /*** * rt_tcp_retransmit_ack - remove skbs from retransmission queue on ACK * @ts: rttcp socket * @ack_seq: received ACK sequence value */ static void rt_tcp_retransmit_ack(struct tcp_socket *ts, u32 ack_seq) { struct rtskb *skb; rtdm_lockctx_t context; rtdm_lock_get_irqsave(&ts->socket_lock, context); /* ACK, but retransmission queue is empty. This could happen on repeated ACKs */ if
(rtskb_queue_empty(&ts->retransmit_queue)) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } /* Check the ts->nacked_first value first to ensure that an skb for retransmission is present in the queue, otherwise the retransmission queue would be drained completely */ if (!rt_tcp_before(ts->nacked_first, ack_seq)) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } if (timerwheel_remove_timer(&ts->timer) != 0) { /* already timed out */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } dequeue_loop: if (ts->tcp_state == TCP_CLOSE) { /* guard the queue against a race with whoever closes the socket */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } if ((skb = __rtskb_dequeue(&ts->retransmit_queue)) == NULL) { ts->timer_state = max_retransmits; rtdm_lock_put_irqrestore(&ts->socket_lock, context); return; } if (rt_tcp_before(ts->nacked_first, ack_seq)) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); kfree_rtskb(skb); rtdm_lock_get_irqsave(&ts->socket_lock, context); goto dequeue_loop; } /* Put NACKed skb back to queue */ /* BUG, need to respect half-acknowledged packets */ ts->nacked_first = ntohl(skb->h.th->seq) + 1; __rtskb_queue_head(&ts->retransmit_queue, skb); /* More packets remain in the retransmission queue, restart the timer */ timerwheel_add_timer(&ts->timer, rt_tcp_retransmission_timeout); rtdm_lock_put_irqrestore(&ts->socket_lock, context); } /*** * rt_tcp_retransmit_send - enqueue a skb to retransmission queue (caller holds the socket lock) * @ts: rttcp socket * @skb: a copied skb for enqueueing */ static void rt_tcp_retransmit_send(struct tcp_socket *ts, struct rtskb *skb) { if (rtskb_queue_empty(&ts->retransmit_queue)) { /* retransmission queue is empty */ ts->nacked_first = ntohl(skb->h.th->seq) + 1; __rtskb_queue_tail(&ts->retransmit_queue, skb); timerwheel_add_timer(&ts->timer, rt_tcp_retransmission_timeout); } else { /* retransmission queue is not empty */ __rtskb_queue_tail(&ts->retransmit_queue, skb); } } static int rt_ip_build_frame(struct rtskb *skb, struct rtsocket *sk, struct dest_route *rt, struct iphdr *iph) { int ret; struct rtnet_device *rtdev = rt->rtdev; RTNET_ASSERT(rtdev->hard_header, return -EBADF;); if (!rtdev_reference(rt->rtdev)) return -EIDRM; iph->ihl = 5; /* 20 byte header only - no IP options */ skb->nh.iph = iph; iph->version = 4; iph->tos = sk->prot.inet.tos; iph->tot_len = htons(skb->len); /* length of IP header and IP payload */ iph->id = htons(0x00); /* zero IP frame id */ iph->frag_off = htons(IP_DF); /* don't fragment */ iph->ttl = 255; iph->protocol = sk->protocol; iph->saddr = rtdev->local_ip; iph->daddr = rt->ip; iph->check = 0; /* required to compute correct checksum */ iph->check = ip_fast_csum((u8 *)iph, 5 /*iph->ihl*/); ret = rtdev->hard_header(skb, rtdev, ETH_P_IP, rt->dev_addr, rtdev->dev_addr, skb->len); rtdev_dereference(rt->rtdev); if (ret != rtdev->hard_header_len) { rtdm_printk("rttcp: rt_ip_build_frame: error on lower level\n"); return -EINVAL; } return 0; } static void rt_tcp_build_header(struct tcp_socket *ts, struct rtskb *skb, __be32 flags, u8 is_keepalive) { u32 wcheck; u8 tcphdrlen = 20; u8 iphdrlen = 20; struct tcphdr *th; th = skb->h.th; th->source = ts->sport; th->dest = ts->dport; th->seq = htonl(ts->sync.seq); if (unlikely(is_keepalive)) th->seq = htonl(ts->sync.seq - 1); /* keepalive probes carry seq - 1 */ tcp_flag_word(th) = flags; th->ack_seq = htonl(ts->sync.ack_seq); th->window = htons(ts->sync.window); th->doff = tcphdrlen >> 2; /* No options for now */ th->res1 = 0; th->check = 0; th->urg_ptr = 0; /* compute checksum */ wcheck =
rtnet_csum(th, tcphdrlen, 0); if (skb->len - tcphdrlen - iphdrlen) { wcheck = rtnet_csum(skb->data + tcphdrlen + iphdrlen, skb->len - tcphdrlen - iphdrlen, wcheck); } th->check = tcp_v4_check(skb->len - iphdrlen, ts->saddr, ts->daddr, wcheck); } static int rt_tcp_segment(struct dest_route *rt, struct tcp_socket *ts, __be32 flags, u32 data_len, u8 *data_ptr, u8 is_keepalive) { struct tcphdr *th; struct rtsocket *sk = &ts->sock; struct rtnet_device *rtdev = rt->rtdev; struct rtskb *skb; struct iphdr *iph; struct rtskb *cloned_skb; rtdm_lockctx_t context; int ret; u32 hh_len = (rtdev->hard_header_len + 15) & ~15; u32 prio = (volatile unsigned int)sk->priority; u32 mtu = rtdev->get_mtu(rtdev, prio); u8 *data = NULL; if ((skb = alloc_rtskb(mtu + hh_len + 15, &sk->skb_pool)) == NULL) { rtdm_printk( "rttcp: no more elements in skb_pool for allocation\n"); return -ENOBUFS; } /* rtskb_reserve(skb, hh_len + 20); */ rtskb_reserve(skb, hh_len); iph = (struct iphdr *)rtskb_put(skb, 20); /* length of IP header */ skb->nh.iph = iph; th = (struct tcphdr *)rtskb_put(skb, 20); /* length of TCP header */ skb->h.th = th; /* clamp the payload to the room one local PHY MTU frame leaves after the IP and TCP headers (20 + 20 bytes, no options) */ if (data_len > mtu - (20 + 20)) data_len = mtu - (20 + 20); if (data_len) { data = (u8 *)rtskb_put(skb, data_len); /* length of TCP payload */ memcpy(data, data_ptr, data_len); } skb->rtdev = rtdev; skb->priority = prio; /* do not validate socket connection on xmit; this should be done at the upper level */ rtdm_lock_get_irqsave(&ts->socket_lock, context); rt_tcp_build_header(ts, skb, flags, is_keepalive); if ((ret = rt_ip_build_frame(skb, sk, rt, iph)) != 0) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto error; } /* add rtskb entry to the socket retransmission queue */ if (ts->tcp_state != TCP_CLOSE && ((flags & (TCP_FLAG_SYN | TCP_FLAG_FIN)) || data_len)) { /* rtskb_clone below is called under the lock; this is a concession, because for now there is no rtskb copy by reference */ cloned_skb = rtskb_clone(skb, &ts->sock.skb_pool); if (!cloned_skb) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_printk("rttcp: can't clone skb\n"); ret = -ENOMEM; goto error; } rt_tcp_retransmit_send(ts, cloned_skb); } /* update sync here: doing so before xmit avoids races with a fast ACK response */ if (flags & (TCP_FLAG_FIN | TCP_FLAG_SYN)) ts->sync.seq++; ts->sync.seq += data_len; ts->sync.dst_window -= data_len; rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* ignore the return value of rtdev_xmit: the packet is already queued for retransmission and will be resent on error; if the error persists beyond the retransmission timeout, the connection is declared lost */ rtdev_xmit(skb); return data_len; error: kfree_rtskb(skb); return ret; } static int rt_tcp_send(struct tcp_socket *ts, __be32 flags) { struct dest_route rt; int ret; /* * We may not have a route yet during setup. But once it is set, it stays * until the socket dies.
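* A pure control segment, e.g. rt_tcp_send(ts, TCP_FLAG_ACK), therefore * normally reuses the cached ts->rt and only falls back to an explicit * rt_ip_route_output()/rtdev_dereference() pair while the route has not * been cached yet.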
*/ if (likely(ts->rt.rtdev)) { ret = rt_tcp_segment(&ts->rt, ts, flags, 0, NULL, 0); } else { ret = rt_ip_route_output(&rt, ts->daddr, ts->saddr); if (ret == 0) { ret = rt_tcp_segment(&rt, ts, flags, 0, NULL, 0); rtdev_dereference(rt.rtdev); } } if (ret < 0) rtdm_printk("rttcp: can't send a packet: err %d\n", -ret); return ret; } #ifdef YET_UNUSED static void rt_tcp_keepalive_timer(rtdm_timer_t *timer) { rtdm_lockctx_t context; struct tcp_keepalive *keepalive = container_of(timer, struct tcp_keepalive, timer); struct tcp_socket *ts = container_of(keepalive, struct tcp_socket, keepalive); int signal = 0; rtdm_lock_get_irqsave(&ts->socket_lock, context); if (keepalive->probes) { /* Send a probe */ if (rt_tcp_segment(&ts->rt, ts, 0, 0, NULL, 1) < 0) { /* data receiving and sending is not possible anymore */ signal = rt_tcp_socket_invalidate(ts, TCP_TIME_WAIT); rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto out; } keepalive->probes--; rtdm_timer_start_in_handler(&keepalive->timer, rt_tcp_keepalive_intvl, 0, RTDM_TIMERMODE_RELATIVE); rtdm_lock_put_irqrestore(&ts->socket_lock, context); } else { /* data receiving and sending is not possible anymore */ signal = rt_tcp_socket_invalidate(ts, TCP_TIME_WAIT); rtdm_lock_put_irqrestore(&ts->socket_lock, context); } out: if (signal) rt_tcp_socket_invalidate_signal(ts); } #endif static inline u32 rt_tcp_initial_seq(void) { uint64_t clock_val = rtdm_clock_read_monotonic(); return (u32)(clock_val ^ (clock_val >> 32)); } /*** * rt_tcp_dest_socket */ static struct rtsocket *rt_tcp_dest_socket(struct rtskb *skb) { struct tcphdr *th = skb->h.th; u32 saddr = skb->nh.iph->saddr; u32 daddr = skb->nh.iph->daddr; u32 sport = th->source; u32 dport = th->dest; u32 data_len; if (tcp_v4_check(skb->len, saddr, daddr, rtnet_csum(skb->data, skb->len, 0))) { rtdm_printk("rttcp: invalid TCP packet checksum, dropped\n"); return NULL; /* Invalid checksum, drop the packet */ } /* find the destination socket */ if ((skb->sk = rt_tcp_v4_lookup(daddr, dport)) == NULL) { /* rtdm_printk("Not found addr:0x%08x, port: 0x%04x\n", daddr, dport); */ if (!th->rst) { /* No listening socket found, send RST|ACK */ rst_socket.saddr = daddr; rst_socket.daddr = saddr; rst_socket.sport = dport; rst_socket.dport = sport; data_len = skb->len - (th->doff << 2); rst_socket.sync.seq = 0; rst_socket.sync.ack_seq = rt_tcp_compute_ack_seq(th, data_len); if (rt_ip_route_output(&rst_socket.rt, daddr, saddr) == 0) { rt_socket_reference(&rst_socket.sock); rt_tcp_send(&rst_socket, TCP_FLAG_ACK | TCP_FLAG_RST); rtdev_dereference(rst_socket.rt.rtdev); } } } return skb->sk; } static void rt_tcp_window_update(struct tcp_socket *ts, u16 window) { rtdm_lockctx_t context; rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->sync.dst_window) { ts->sync.dst_window = window; rtdm_lock_put_irqrestore(&ts->socket_lock, context); if (!window) { /* clear send event status */ rtdm_event_clear(&ts->send_evt); } } else { if (window) { ts->sync.dst_window = window; rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* set send event status */ rtdm_event_signal(&ts->send_evt); } else { rtdm_lock_put_irqrestore(&ts->socket_lock, context); } } } /*** * rt_tcp_rcv */ static void rt_tcp_rcv(struct rtskb *skb) { rtdm_lockctx_t context; struct tcp_socket *ts; struct tcphdr *th = skb->h.th; unsigned int data_len = skb->len - (th->doff << 2); u32 seq = ntohl(th->seq); int signal; ts = container_of(skb->sk, struct tcp_socket, sock); rtdm_lock_get_irqsave(&ts->socket_lock, context); #ifdef
CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION if (ts->error_rate > 0) { if ((ts->packet_counter++ % ts->error_rate) < ts->multi_error) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto drop; } } #endif /* CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION */ /* Check for daddr/dport correspondence to values stored in selected socket from hash */ if (ts->tcp_state != TCP_LISTEN && (ts->daddr != skb->nh.iph->saddr || ts->dport != skb->h.th->source)) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto drop; } /* Check if it is a keepalive probe */ if (ts->sync.ack_seq == (seq + 1) && ts->tcp_state == TCP_ESTABLISHED) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK); goto feed; } if (ts->tcp_state == TCP_SYN_SENT) { ts->sync.ack_seq = rt_tcp_compute_ack_seq(th, data_len); if (th->syn && th->ack) { rt_tcp_socket_validate(ts); rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_event_signal(&ts->conn_evt); /* Send ACK */ rt_tcp_send(ts, TCP_FLAG_ACK); goto feed; } ts->tcp_state = TCP_CLOSE; ts->sync.seq = ntohl(th->ack_seq); rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* Send RST|ACK */ rtdm_event_signal(&ts->conn_evt); rt_tcp_send(ts, TCP_FLAG_RST | TCP_FLAG_ACK); goto drop; } /* Check for SEQ correspondence to determine the connection relevance */ /* OR-list of conditions to be satisfied: * * th->ack && rt_tcp_after(ts->nacked_first, ntohl(th->ack_seq)) * th->ack && th->rst && ... * th->syn && (ts->tcp_state == TCP_LISTEN || ts->tcp_state == TCP_SYN_SENT) * rt_tcp_after(seq, ts->sync.ack_seq) && rt_tcp_before(seq, ts->sync.ack_seq + ts->sync.window) */ if ((rt_tcp_after(seq, ts->sync.ack_seq) && rt_tcp_before(seq, ts->sync.ack_seq + ts->sync.window)) || th->rst || (th->syn && (ts->tcp_state == TCP_LISTEN || ts->tcp_state == TCP_SYN_SENT))) { /* everything is ok */ } else if (rt_tcp_after(seq, ts->sync.ack_seq - data_len)) { /* retransmission of data we already acked */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK); goto drop; } else { /* drop forward ack */ if (th->ack && /* but reset ack from old connection */ ts->tcp_state == TCP_ESTABLISHED) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_printk( "rttcp: dropped inappropriate ACK packet %u\n", ts->sync.ack_seq); goto drop; } rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_printk("rttcp: sequence number is not in window, " "dropped (failed: %u <= %u <= %u)\n", ts->sync.ack_seq, seq, ts->sync.ack_seq + ts->sync.window); /* That's a forced RST for a lost connection */ rst_socket.saddr = skb->nh.iph->daddr; rst_socket.daddr = skb->nh.iph->saddr; rst_socket.sport = th->dest; rst_socket.dport = th->source; rst_socket.sync.seq = ntohl(th->ack_seq); rst_socket.sync.ack_seq = rt_tcp_compute_ack_seq(th, data_len); if (rt_ip_route_output(&rst_socket.rt, rst_socket.daddr, rst_socket.saddr) == 0) { rt_socket_reference(&rst_socket.sock); rt_tcp_send(&rst_socket, TCP_FLAG_RST | TCP_FLAG_ACK); rtdev_dereference(rst_socket.rt.rtdev); } goto drop; } if (th->rst) { if (ts->tcp_state == TCP_SYN_RECV) { ts->tcp_state = TCP_LISTEN; rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto drop; } else { /* Drop our half-open connection, peer obviously went away.
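An RST in SYN_RECV just returned the listener to TCP_LISTEN above; in any other synchronized state the socket is invalidated to TCP_CLOSE below, so that pending readers and writers are woken up.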
*/ signal = rt_tcp_socket_invalidate(ts, TCP_CLOSE); rtdm_lock_put_irqrestore(&ts->socket_lock, context); if (signal) rt_tcp_socket_invalidate_signal(ts); goto drop; } } ts->sync.ack_seq = rt_tcp_compute_ack_seq(th, data_len); if (th->fin) { if (ts->tcp_state == TCP_ESTABLISHED) { /* Send ACK */ signal = rt_tcp_socket_invalidate(ts, TCP_CLOSE_WAIT); rtdm_lock_put_irqrestore(&ts->socket_lock, context); if (signal) rt_tcp_socket_invalidate_signal(ts); rt_tcp_send(ts, TCP_FLAG_ACK); goto feed; } else if ((ts->tcp_state == TCP_FIN_WAIT1 && th->ack) || ts->tcp_state == TCP_FIN_WAIT2) { /* Send ACK */ ts->tcp_state = TCP_TIME_WAIT; rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK); /* data receiving is not possible anymore */ rtdm_sem_destroy(&ts->sock.pending_sem); rtdm_nrtsig_pend(&ts->close_sig); goto feed; } else if (ts->tcp_state == TCP_FIN_WAIT1) { /* Send ACK */ ts->tcp_state = TCP_CLOSING; rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK); /* data receiving is not possible anymore */ rtdm_sem_destroy(&ts->sock.pending_sem); goto feed; } else { /* just drop it */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto drop; } } if (th->syn) { /* Need to differentiate a LISTEN socket from an ESTABLISHED one */ /* Both of them have the same sport/saddr, but different dport/daddr */ /* dport is unknown until the first SYN of a new connection arrives */ if (ts->tcp_state == TCP_LISTEN) { /* The initial sequence number is generated and stored here, before the SYN|ACK is sent */ /* The socket is in TCP_LISTEN state */ /* safe to update ts->saddr here, since rt_tcp_rcv() and rt_tcp_dest_socket() are called from a single task */ ts->saddr = skb->nh.iph->daddr; ts->daddr = skb->nh.iph->saddr; ts->dport = th->source; ts->sync.seq = rt_tcp_initial_seq(); ts->sync.window = 4096; ts->tcp_state = TCP_SYN_RECV; rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* Send SYN|ACK */ rt_tcp_send(ts, TCP_FLAG_SYN | TCP_FLAG_ACK); goto drop; } /* Send RST|ACK */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_RST | TCP_FLAG_ACK); goto drop; } /* ACK received without SYN, FIN or RST flags */ if (th->ack) { /* Check ack sequence */ if (rt_tcp_before(ts->sync.seq + 1, ntohl(th->ack_seq))) { rtdm_printk("rttcp: unexpected ACK %u %u %u\n", ts->sync.seq, ts->nacked_first, ntohl(th->ack_seq)); rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto drop; } if (ts->tcp_state == TCP_LAST_ACK) { /* close connection and free socket data */ ts->tcp_state = TCP_CLOSE; rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* socket destruction will be done on close() */ rtdm_nrtsig_pend(&ts->close_sig); goto drop; } else if (ts->tcp_state == TCP_FIN_WAIT1) { ts->tcp_state = TCP_FIN_WAIT2; rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto feed; } else if (ts->tcp_state == TCP_SYN_RECV) { rt_tcp_socket_validate(ts); rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_event_signal(&ts->conn_evt); goto feed; } else if (ts->tcp_state == TCP_CLOSING) { ts->tcp_state = TCP_TIME_WAIT; rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* socket destruction will be done on close() */ rtdm_nrtsig_pend(&ts->close_sig); goto feed; } } if (ts->tcp_state != TCP_ESTABLISHED) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto drop; } if (data_len == 0) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); goto feed; } /* Send ACK */ ts->sync.window -= data_len; rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK);
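/* In-order payload: hand the skb over to the socket's incoming queue and wake one reader; the ACK/keepalive/window bookkeeping below mirrors the feed: path, except that here the skb is consumed by the queue instead of being freed. */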
rtskb_queue_tail(&skb->sk->incoming, skb); rtdm_sem_up(&ts->sock.pending_sem); /* inform retransmission subsystem about arrived ack */ if (th->ack) { rt_tcp_retransmit_ack(ts, ntohl(th->ack_seq)); } rt_tcp_keepalive_feed(ts); rt_tcp_window_update(ts, ntohs(th->window)); return; feed: /* inform retransmission subsystem about arrived ack */ if (th->ack) { rt_tcp_retransmit_ack(ts, ntohl(th->ack_seq)); } rt_tcp_keepalive_feed(ts); rt_tcp_window_update(ts, ntohs(th->window)); drop: kfree_rtskb(skb); return; } /*** * rt_tcp_rcv_err */ static void rt_tcp_rcv_err(struct rtskb *skb) { rtdm_printk("rttcp: rt_tcp_rcv_err\n"); } static int rt_tcp_window_send(struct tcp_socket *ts, u32 data_len, u8 *data_ptr) { u32 dst_window = ts->sync.dst_window; int ret; if (data_len > dst_window) data_len = dst_window; if ((ret = rt_tcp_segment(&ts->rt, ts, TCP_FLAG_ACK, data_len, data_ptr, 0)) < 0) { rtdm_printk("rttcp: can't send a packet: err %d\n", -ret); return ret; } return ret; } static void rt_tcp_close_signal_handler(rtdm_nrtsig_t *nrtsig, void *arg) { complete_all((struct completion *)arg); } static int rt_tcp_socket_create(struct tcp_socket *ts) { rtdm_lockctx_t context; int i; int index; struct rtsocket *sock = &ts->sock; sock->prot.inet.saddr = INADDR_ANY; sock->prot.inet.state = TCP_CLOSE; sock->prot.inet.tos = 0; /* rtdm_printk("rttcp: rt_tcp_socket_create 0x%p\n", ts); */ rtdm_lock_init(&ts->socket_lock); ts->rt.rtdev = NULL; ts->tcp_state = TCP_CLOSE; ts->is_accepting = 0; ts->is_accepted = 0; ts->is_binding = 0; ts->is_bound = 0; ts->is_valid = 0; ts->is_closed = 0; ts->sk_sndtimeo = RTDM_TIMEOUT_INFINITE; rtdm_event_init(&ts->conn_evt, 0); ts->keepalive.enabled = 0; ts->timer_state = max_retransmits; timerwheel_init_timer(&ts->timer, rt_tcp_retransmit_handler, ts); rtskb_queue_init(&ts->retransmit_queue); init_completion(&ts->fin_handshake); rtdm_nrtsig_init(&ts->close_sig, rt_tcp_close_signal_handler, &ts->fin_handshake); #ifdef CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION ts->packet_counter = counter_start; ts->error_rate = error_rate; ts->multi_error = multi_error; #endif /* CONFIG_XENO_DRIVERS_NET_RTIPV4_TCP_ERROR_INJECTION */ rtdm_lock_get_irqsave(&tcp_socket_base_lock, context); /* enforce maximum number of TCP sockets */ if (free_ports == 0) { rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); rtdm_nrtsig_destroy(&ts->close_sig); return -EAGAIN; } free_ports--; /* find free auto-port in bitmap */ for (i = 0; i < RT_PORT_BITMAP_WORDS; i++) if (port_bitmap[i] != (unsigned long)-1) break; index = ffz(port_bitmap[i]); set_bit(index, &port_bitmap[i]); index += i * BITS_PER_LONG; sock->prot.inet.reg_index = index; sock->prot.inet.sport = index + tcp_auto_port_start; /* register TCP socket */ port_registry[index] = ts; port_hash_insert(ts, INADDR_ANY, sock->prot.inet.sport); rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); return 0; } /*** * rt_tcp_socket - create a new TCP socket * @fd: socket file descriptor */ static int rt_tcp_socket(struct rtdm_fd *fd) { struct tcp_socket *ts = rtdm_fd_to_private(fd); int ret; if ((ret = rt_socket_init(fd, IPPROTO_TCP)) != 0) return ret; if ((ret = rt_tcp_socket_create(ts)) != 0) rt_socket_cleanup(fd); return ret; } static int rt_tcp_dispatched_packet_send(struct rt_proc_call *call) { int ret; struct rt_tcp_dispatched_packet_send_cmd *cmd; cmd = rtpc_get_priv(call, struct rt_tcp_dispatched_packet_send_cmd); ret = rt_tcp_send(cmd->ts, cmd->flags); return ret; } /*** * rt_tcp_socket_destruct * this function requires non-realtime context */ static void
rt_tcp_socket_destruct(struct tcp_socket *ts) { rtdm_lockctx_t context; struct rtskb *skb; int index; int signal; struct rtsocket *sock = &ts->sock; /* rtdm_printk("rttcp: rt_tcp_socket_destruct 0x%p\n", ts); */ rtdm_lock_get_irqsave(&tcp_socket_base_lock, context); if (sock->prot.inet.reg_index >= 0) { index = sock->prot.inet.reg_index; clear_bit(index % BITS_PER_LONG, &port_bitmap[index / BITS_PER_LONG]); port_hash_del(port_registry[index]); free_ports++; sock->prot.inet.reg_index = -1; } rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); rtdm_lock_get_irqsave(&ts->socket_lock, context); signal = rt_tcp_socket_invalidate(ts, TCP_CLOSE); rt_tcp_keepalive_disable(ts); sock->prot.inet.state = TCP_CLOSE; /* dereference rtdev */ if (ts->rt.rtdev != NULL) { rtdev_dereference(ts->rt.rtdev); ts->rt.rtdev = NULL; } rtdm_lock_put_irqrestore(&ts->socket_lock, context); if (signal) rt_tcp_socket_invalidate_signal(ts); rtdm_event_destroy(&ts->conn_evt); rtdm_nrtsig_destroy(&ts->close_sig); /* cleanup already collected fragments */ rt_ip_frag_invalidate_socket(sock); /* free packets in incoming queue */ while ((skb = rtskb_dequeue(&sock->incoming)) != NULL) kfree_rtskb(skb); /* ensure that the timer is no longer running */ timerwheel_remove_timer_sync(&ts->timer); /* free packets in retransmission queue */ while ((skb = __rtskb_dequeue(&ts->retransmit_queue)) != NULL) kfree_rtskb(skb); } /*** * rt_tcp_close */ static void rt_tcp_close(struct rtdm_fd *fd) { struct tcp_socket *ts = rtdm_fd_to_private(fd); struct rt_tcp_dispatched_packet_send_cmd send_cmd; rtdm_lockctx_t context; int signal = 0; rtdm_lock_get_irqsave(&ts->socket_lock, context); ts->is_closed = 1; if (ts->tcp_state == TCP_ESTABLISHED || ts->tcp_state == TCP_SYN_RECV) { /* close() from ESTABLISHED */ send_cmd.ts = ts; send_cmd.flags = TCP_FLAG_FIN | TCP_FLAG_ACK; signal = rt_tcp_socket_invalidate(ts, TCP_FIN_WAIT1); rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtpc_dispatch_call(rt_tcp_dispatched_packet_send, 0, &send_cmd, sizeof(send_cmd), NULL, NULL); /* result is ignored */ /* Give the peer some time to reply to our FIN. Since it is not relevant what exactly causes the wait function to return, its result is ignored. */ wait_for_completion_interruptible_timeout(&ts->fin_handshake, msecs_to_jiffies(close_timeout)); } else if (ts->tcp_state == TCP_CLOSE_WAIT) { /* Send FIN in CLOSE_WAIT */ send_cmd.ts = ts; send_cmd.flags = TCP_FLAG_FIN | TCP_FLAG_ACK; signal = rt_tcp_socket_invalidate(ts, TCP_LAST_ACK); rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtpc_dispatch_call(rt_tcp_dispatched_packet_send, 0, &send_cmd, sizeof(send_cmd), NULL, NULL); /* result is ignored */ /* Give the peer some time to reply to our FIN. Since it is not relevant what exactly causes the wait function to return, its result is ignored.
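The wait is bounded by the close_timeout module parameter (default 1000 ms); it ends early when the FIN handshake completes, because rt_tcp_rcv() pends close_sig, whose non-RT handler completes fin_handshake.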
*/ wait_for_completion_interruptible_timeout(&ts->fin_handshake, msecs_to_jiffies(close_timeout)); } else { /* rt_tcp_socket_validate() has not been called at all, hence socket state is TCP_SYN_SENT or TCP_LISTEN, or socket is in one of close states, hence rt_tcp_socket_invalidate() was called, but close() is called for the first time */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); } if (signal) rt_tcp_socket_invalidate_signal(ts); rt_tcp_socket_destruct(ts); rt_socket_cleanup(fd); } /*** * rt_tcp_bind - bind socket to local address * @ts: socket * @addr: local address */ static int rt_tcp_bind(struct rtdm_fd *fd, struct tcp_socket *ts, const struct sockaddr __user *addr, socklen_t addrlen) { struct sockaddr_in *usin, _usin; rtdm_lockctx_t context; int index; int bound = 0; int ret = 0; usin = rtnet_get_arg(fd, &_usin, addr, sizeof(_usin)); if (IS_ERR(usin)) return PTR_ERR(usin); if ((addrlen < (int)sizeof(struct sockaddr_in)) || ((usin->sin_port & tcp_auto_port_mask) == tcp_auto_port_start)) return -EINVAL; rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->tcp_state != TCP_CLOSE || ts->is_bound || ts->is_binding) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EINVAL; } ts->is_binding = 1; rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_lock_get_irqsave(&tcp_socket_base_lock, context); if ((index = ts->sock.prot.inet.reg_index) < 0) { /* socket is destroyed */ ret = -EBADF; goto unlock_out; } port_hash_del(ts); if (port_hash_insert(ts, usin->sin_addr.s_addr, usin->sin_port ?: index + tcp_auto_port_start)) { port_hash_insert(ts, ts->saddr, ts->sport); ret = -EADDRINUSE; goto unlock_out; } bound = 1; unlock_out: rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); rtdm_lock_get_irqsave(&ts->socket_lock, context); ts->is_bound = bound; ts->is_binding = 0; rtdm_lock_put_irqrestore(&ts->socket_lock, context); return ret; } /*** * rt_tcp_connect */ static int rt_tcp_connect(struct rtdm_fd *fd, struct tcp_socket *ts, const struct sockaddr __user *serv_addr, socklen_t addrlen) { struct sockaddr_in *usin, _usin; struct dest_route rt; rtdm_lockctx_t context; int ret; if (addrlen < (int)sizeof(struct sockaddr_in)) return -EINVAL; usin = rtnet_get_arg(fd, &_usin, serv_addr, sizeof(_usin)); if (IS_ERR(usin)) return PTR_ERR(usin); if (usin->sin_family != AF_INET) return -EAFNOSUPPORT; ret = rt_ip_route_output(&rt, usin->sin_addr.s_addr, ts->saddr); if (ret < 0) { /* no route to host */ return -ENETUNREACH; } rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->is_closed) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); ret = -EBADF; goto err_deref; } if (ts->tcp_state != TCP_CLOSE || ts->is_binding) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); ret = -EINVAL; goto err_deref; } if (ts->rt.rtdev == NULL) memcpy(&ts->rt, &rt, sizeof(rt)); else rtdev_dereference(rt.rtdev); ts->saddr = rt.rtdev->local_ip; ts->daddr = usin->sin_addr.s_addr; ts->dport = usin->sin_port; ts->sync.seq = rt_tcp_initial_seq(); ts->sync.ack_seq = 0; ts->sync.window = 4096; ts->sync.dst_window = 0; ts->tcp_state = TCP_SYN_SENT; rtdm_lock_put_irqrestore(&ts->socket_lock, context); /* Complete three-way handshake */ ret = rt_tcp_send(ts, TCP_FLAG_SYN); if (ret < 0) { rtdm_printk("rttcp: can't send SYN\n"); return ret; } ret = rtdm_event_timedwait(&ts->conn_evt, rt_tcp_connection_timeout, NULL); if (unlikely(ret < 0)) switch (ret) { case -EWOULDBLOCK: case -ETIMEDOUT: case -EINTR: return ret; default: return -EBADF; } if (ts->tcp_state == TCP_SYN_SENT) { /* received conn_evt,
but connection is not established */ return -ECONNREFUSED; } return ret; err_deref: rtdev_dereference(rt.rtdev); return ret; } /*** * rt_tcp_listen */ static int rt_tcp_listen(struct tcp_socket *ts, unsigned long backlog) { int ret; rtdm_lockctx_t context; /* Ignore backlog value, maximum number of queued connections is 1 */ rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->is_closed) { ret = -EBADF; goto unlock_out; } if (ts->tcp_state != TCP_CLOSE || ts->is_binding) { ret = -EINVAL; goto unlock_out; } ts->tcp_state = TCP_LISTEN; ret = 0; unlock_out: rtdm_lock_put_irqrestore(&ts->socket_lock, context); return ret; } /*** * rt_tcp_accept */ static int rt_tcp_accept(struct rtdm_fd *fd, struct tcp_socket *ts, struct sockaddr *addr, socklen_t __user *addrlen) { /* Wait for an incoming connection and return the peer's sockaddr; the socket itself was already set up by rt_socket_init(), so the returned file descriptor can be used for read/write right away */ int ret; socklen_t *uaddrlen, _uaddrlen; struct sockaddr_in sin; nanosecs_rel_t timeout = ts->sock.timeout; rtdm_lockctx_t context; struct dest_route rt; uaddrlen = rtnet_get_arg(fd, &_uaddrlen, addrlen, sizeof(_uaddrlen)); if (IS_ERR(uaddrlen)) return PTR_ERR(uaddrlen); rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->is_accepting || ts->is_accepted) { /* socket is already accepted or is accepting a connection right now */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EALREADY; } if (ts->tcp_state != TCP_LISTEN || *uaddrlen < sizeof(struct sockaddr_in)) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EINVAL; } ts->is_accepting = 1; rtdm_lock_put_irqrestore(&ts->socket_lock, context); ret = rtdm_event_timedwait(&ts->conn_evt, timeout, NULL); if (unlikely(ret < 0)) switch (ret) { case -ETIMEDOUT: case -EINTR: goto err; default: ret = -EBADF; goto err; } /* accept() was woken up by connection establishment */ ret = rt_ip_route_output(&rt, ts->daddr, ts->saddr); if (ret < 0) { /* strange, no route to host, keep status quo */ ret = -EPROTO; goto err; } if (addr) { sin.sin_family = AF_INET; sin.sin_port = ts->dport; sin.sin_addr.s_addr = ts->daddr; ret = rtnet_put_arg(fd, addr, &sin, sizeof(sin)); if (ret) { rtdev_dereference(rt.rtdev); ret = -EFAULT; goto err; } } rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->tcp_state != TCP_ESTABLISHED) { /* protocol error */ rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdev_dereference(rt.rtdev); ret = -EPROTO; goto err; } if (ts->rt.rtdev == NULL) memcpy(&ts->rt, &rt, sizeof(rt)); else rtdev_dereference(rt.rtdev); ts->is_accepted = 1; rtdm_lock_put_irqrestore(&ts->socket_lock, context); ret = rtdm_fd_ufd(rt_socket_fd(&ts->sock)); err: /* it is not critical to leave this unlocked due to single entry nature of accept() */ ts->is_accepting = 0; return ret; } /*** * rt_tcp_shutdown */ static int rt_tcp_shutdown(struct tcp_socket *ts, unsigned long how) { return -EOPNOTSUPP; } /*** * rt_tcp_setsockopt */ static int rt_tcp_setsockopt(struct rtdm_fd *fd, struct tcp_socket *ts, int level, int optname, const void *optval, socklen_t optlen) { /* uint64_t val; */ struct __kernel_old_timeval tv; rtdm_lockctx_t context; switch (optname) { case SO_KEEPALIVE: if (optlen < sizeof(unsigned int)) return -EINVAL; /* commented out, because current implementation transmits keepalive probes from interrupt context */ /* val = *(unsigned long*)optval; if (val) rt_tcp_keepalive_enable(ts); else rt_tcp_keepalive_disable(ts); */ return 0; case SO_SNDTIMEO_OLD: if (optlen < sizeof(tv)) return -EINVAL; if
(rtdm_copy_from_user(fd, &tv, optval, sizeof(tv))) return -EFAULT; if (tv.tv_usec < 0 || tv.tv_usec >= 1000000) return -EDOM; rtdm_lock_get_irqsave(&ts->socket_lock, context); if (tv.tv_sec < 0) { ts->sk_sndtimeo = RTDM_TIMEOUT_NONE; rtdm_lock_put_irqrestore(&ts->socket_lock, context); return 0; } ts->sk_sndtimeo = RTDM_TIMEOUT_INFINITE; if (tv.tv_sec == 0 && tv.tv_usec == 0) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return 0; } if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / 1000000000ull - 1)) ts->sk_sndtimeo = (tv.tv_sec * 1000000 + tv.tv_usec) * 1000; rtdm_lock_put_irqrestore(&ts->socket_lock, context); return 0; case SO_REUSEADDR: /* to implement */ return -EOPNOTSUPP; } return -ENOPROTOOPT; } /*** * rt_tcp_getsockopt */ static int rt_tcp_getsockopt(struct rtdm_fd *fd, struct tcp_socket *ts, int level, int optname, void *optval, socklen_t *optlen) { int ret = 0; if (*optlen < sizeof(unsigned int)) return -EINVAL; switch (optname) { case SO_ERROR: ret = 0; /* used in nonblocking connect(), extend later */ break; default: ret = -ENOPROTOOPT; break; } return ret; } /*** * rt_tcp_ioctl */ static int rt_tcp_ioctl(struct rtdm_fd *fd, unsigned int request, void __user *arg) { struct tcp_socket *ts = rtdm_fd_to_private(fd); const struct _rtdm_setsockaddr_args *setaddr; struct _rtdm_setsockaddr_args _setaddr; const struct _rtdm_getsockaddr_args *getaddr; struct _rtdm_getsockaddr_args _getaddr; const struct _rtdm_getsockopt_args *getopt; struct _rtdm_getsockopt_args _getopt; const struct _rtdm_setsockopt_args *setopt; struct _rtdm_setsockopt_args _setopt; int in_rt; /* fast path for common socket IOCTLs */ if (_IOC_TYPE(request) == RTIOC_TYPE_NETWORK) return rt_socket_common_ioctl(fd, request, arg); in_rt = rtdm_in_rt_context(); switch (request) { case _RTIOC_BIND: setaddr = rtnet_get_arg(fd, &_setaddr, arg, sizeof(_setaddr)); if (IS_ERR(setaddr)) return PTR_ERR(setaddr); return rt_tcp_bind(fd, ts, setaddr->addr, setaddr->addrlen); case _RTIOC_CONNECT: if (!in_rt) return -ENOSYS; setaddr = rtnet_get_arg(fd, &_setaddr, arg, sizeof(_setaddr)); if (IS_ERR(setaddr)) return PTR_ERR(setaddr); return rt_tcp_connect(fd, ts, setaddr->addr, setaddr->addrlen); case _RTIOC_LISTEN: return rt_tcp_listen(ts, (unsigned long)arg); case _RTIOC_ACCEPT: if (!in_rt) return -ENOSYS; getaddr = rtnet_get_arg(fd, &_getaddr, arg, sizeof(_getaddr)); if (IS_ERR(getaddr)) return PTR_ERR(getaddr); return rt_tcp_accept(fd, ts, getaddr->addr, getaddr->addrlen); case _RTIOC_SHUTDOWN: return rt_tcp_shutdown(ts, (unsigned long)arg); case _RTIOC_SETSOCKOPT: setopt = rtnet_get_arg(fd, &_setopt, arg, sizeof(_setopt)); if (IS_ERR(setopt)) return PTR_ERR(setopt); if (setopt->level != SOL_SOCKET) break; return rt_tcp_setsockopt(fd, ts, setopt->level, setopt->optname, setopt->optval, setopt->optlen); case _RTIOC_GETSOCKOPT: getopt = rtnet_get_arg(fd, &_getopt, arg, sizeof(_getopt)); if (IS_ERR(getopt)) return PTR_ERR(getopt); if (getopt->level != SOL_SOCKET) break; return rt_tcp_getsockopt(fd, ts, getopt->level, getopt->optname, getopt->optval, getopt->optlen); default: break; } return rt_ip_ioctl(fd, request, arg); } /*** * rt_tcp_read */ static ssize_t rt_tcp_read(struct rtdm_fd *fd, void *buf, size_t nbyte) { struct tcp_socket *ts = rtdm_fd_to_private(fd); struct rtsocket *sock = &ts->sock; struct rtskb *skb; struct rtskb *first_skb; nanosecs_rel_t timeout = sock->timeout; size_t data_len; size_t th_len; size_t copied = 0; size_t block_size; u8 *user_buf = buf; int ret; rtdm_lockctx_t context; rtdm_toseq_t timeout_seq; if 
(!rtdm_fd_is_user(fd)) { return -EFAULT; } rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->is_closed) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EBADF; } if (!ts->is_valid) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return 0; } if (ts->tcp_state != TCP_ESTABLISHED && ts->tcp_state != TCP_FIN_WAIT2) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EINVAL; } rtdm_lock_put_irqrestore(&ts->socket_lock, context); rtdm_toseq_init(&timeout_seq, timeout); while (copied < nbyte) { ret = rtdm_sem_timeddown(&ts->sock.pending_sem, timeout, &timeout_seq); if (unlikely(ret < 0)) switch (ret) { case -EWOULDBLOCK: case -ETIMEDOUT: case -EINTR: return (copied ? copied : ret); case -EIDRM: /* event is destroyed */ if (ts->is_closed) return -EBADF; return copied; default: if (ts->is_closed) { return -EBADF; } return 0; } skb = rtskb_dequeue_chain(&sock->incoming); RTNET_ASSERT(skb != NULL, return -EFAULT;); th_len = (skb->h.th->doff) << 2; data_len = skb->len - th_len; __rtskb_pull(skb, th_len); first_skb = skb; /* iterate over all IP fragments */ iterate_fragments: block_size = skb->len; copied += block_size; data_len -= block_size; if (copied > nbyte) { block_size -= copied - nbyte; copied = nbyte; if (rtdm_copy_to_user(fd, user_buf, skb->data, block_size)) { kfree_rtskb(first_skb); /* or store the data? */ return -EFAULT; } rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->sync.window) { ts->sync.window += block_size; rtdm_lock_put_irqrestore(&ts->socket_lock, context); } else { ts->sync.window = block_size; rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK); /* window update */ } __rtskb_pull(skb, block_size); __rtskb_push(first_skb, sizeof(struct tcphdr)); first_skb->h.th->doff = 5; rtskb_queue_head(&sock->incoming, first_skb); rtdm_sem_up(&ts->sock.pending_sem); return copied; } if (rtdm_copy_to_user(fd, user_buf, skb->data, block_size)) { kfree_rtskb(first_skb); /* or store the data? 
*/ return -EFAULT; } rtdm_lock_get_irqsave(&ts->socket_lock, context); if (ts->sync.window) { ts->sync.window += block_size; rtdm_lock_put_irqrestore(&ts->socket_lock, context); } else { ts->sync.window = block_size; rtdm_lock_put_irqrestore(&ts->socket_lock, context); rt_tcp_send(ts, TCP_FLAG_ACK); /* window update */ } if ((skb = skb->next) != NULL) { user_buf += data_len; goto iterate_fragments; } kfree_rtskb(first_skb); } return copied; } /*** * rt_tcp_write */ static ssize_t rt_tcp_write(struct rtdm_fd *fd, const void __user *user_buf, size_t nbyte) { struct tcp_socket *ts = rtdm_fd_to_private(fd); uint32_t sent_len = 0; rtdm_lockctx_t context; int ret = 0; nanosecs_rel_t sk_sndtimeo; void *buf; if (!rtdm_fd_is_user(fd)) { return -EFAULT; } rtdm_lock_get_irqsave(&ts->socket_lock, context); sk_sndtimeo = ts->sk_sndtimeo; if (!ts->is_valid) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EPIPE; } if ((ts->daddr | ts->dport) == 0 || ts->tcp_state != TCP_ESTABLISHED) { rtdm_lock_put_irqrestore(&ts->socket_lock, context); return -EINVAL; } rtdm_lock_put_irqrestore(&ts->socket_lock, context); buf = xnmalloc(nbyte); if (buf == NULL) return -ENOMEM; ret = rtdm_copy_from_user(fd, buf, user_buf, nbyte); if (ret) { xnfree(buf); return ret; } while (sent_len < nbyte) { ret = rtdm_event_timedwait(&ts->send_evt, sk_sndtimeo, NULL); if (unlikely(ret < 0)) switch (ret) { case -EWOULDBLOCK: case -ETIMEDOUT: case -EINTR: xnfree(buf); return sent_len ?: ret; case -EIDRM: /* event is destroyed */ default: if (ts->is_closed) { xnfree(buf); return -EBADF; } xnfree(buf); return sent_len ?: ret; } ret = rt_tcp_window_send(ts, nbyte - sent_len, ((u8 *)buf) + sent_len); if (ret < 0) { /* check this branch correctness */ rtdm_event_signal(&ts->send_evt); break; } sent_len += ret; if (ts->sync.dst_window) rtdm_event_signal(&ts->send_evt); } xnfree(buf); return (ret < 0 ? 
ret : sent_len); } /*** * rt_tcp_recvmsg */ static ssize_t rt_tcp_recvmsg(struct rtdm_fd *fd, struct user_msghdr *msg, int msg_flags) { struct iovec iov_fast[RTDM_IOV_FASTMAX], *iov; ssize_t ret; size_t len; void *buf; if (msg_flags) return -EOPNOTSUPP; /* loop over all vectors to be implemented */ if (msg->msg_iovlen != 1) return -EOPNOTSUPP; ret = rtdm_get_iovec(fd, &iov, msg, iov_fast); if (ret) return ret; len = iov[0].iov_len; if (len > 0) { buf = iov[0].iov_base; ret = rt_tcp_read(fd, buf, len); } rtdm_drop_iovec(iov, iov_fast); return ret; } /*** * rt_tcp_sendmsg */ static ssize_t rt_tcp_sendmsg(struct rtdm_fd *fd, const struct user_msghdr *msg, int msg_flags) { struct iovec iov_fast[RTDM_IOV_FASTMAX], *iov; ssize_t ret; size_t len; if (msg_flags) return -EOPNOTSUPP; /* loop over all vectors to be implemented */ if (msg->msg_iovlen != 1) return -EOPNOTSUPP; ret = rtdm_get_iovec(fd, &iov, msg, iov_fast); if (ret) return ret; len = iov[0].iov_len; if (len > 0) ret = rt_tcp_write(fd, iov[0].iov_base, len); rtdm_drop_iovec(iov, iov_fast); return ret; } /*** * rt_tcp_select */ static int rt_tcp_select(struct rtdm_fd *fd, rtdm_selector_t *selector, enum rtdm_selecttype type, unsigned fd_index) { struct tcp_socket *ts = rtdm_fd_to_private(fd); switch (type) { case XNSELECT_READ: return rtdm_sem_select(&ts->sock.pending_sem, selector, XNSELECT_READ, fd_index); case XNSELECT_WRITE: return rtdm_event_select(&ts->send_evt, selector, XNSELECT_WRITE, fd_index); default: return -EBADF; } return -EINVAL; } /*** * TCP-Initialisation */ static struct rtinet_protocol tcp_protocol = { .protocol = IPPROTO_TCP, .dest_socket = &rt_tcp_dest_socket, .rcv_handler = &rt_tcp_rcv, .err_handler = &rt_tcp_rcv_err, .init_socket = &rt_tcp_socket }; static struct rtdm_driver tcp_driver = { .profile_info = RTDM_PROFILE_INFO(tcp, RTDM_CLASS_NETWORK, RTDM_SUBCLASS_RTNET, RTNET_RTDM_VER), .device_flags = RTDM_PROTOCOL_DEVICE, .device_count = 1, .context_size = sizeof(struct tcp_socket), .protocol_family = PF_INET, .socket_type = SOCK_STREAM, .ops = { .socket = rt_inet_socket, .close = rt_tcp_close, .ioctl_rt = rt_tcp_ioctl, .ioctl_nrt = rt_tcp_ioctl, .read_rt = rt_tcp_read, .write_rt = rt_tcp_write, .recvmsg_rt = rt_tcp_recvmsg, .sendmsg_rt = rt_tcp_sendmsg, .select = rt_tcp_select, }, }; static struct rtdm_device tcp_device = { .driver = &tcp_driver, .label = "tcp", }; #ifdef CONFIG_XENO_OPT_VFILE /*** * rt_tcp_proc_read */ static inline char *rt_tcp_string_of_state(u8 state) { switch (state) { case TCP_ESTABLISHED: return "ESTABLISHED"; case TCP_SYN_SENT: return "SYN_SENT"; case TCP_SYN_RECV: return "SYN_RECV"; case TCP_FIN_WAIT1: return "FIN_WAIT1"; case TCP_FIN_WAIT2: return "FIN_WAIT2"; case TCP_TIME_WAIT: return "TIME_WAIT"; case TCP_CLOSE: return "CLOSE"; case TCP_CLOSE_WAIT: return "CLOSE_WAIT"; case TCP_LAST_ACK: return "LAST_ACK"; case TCP_LISTEN: return "LISTEN"; case TCP_CLOSING: return "CLOSING"; default: return "UNKNOWN"; } } static int rtnet_ipv4_tcp_show(struct xnvfile_regular_iterator *it, void *data) { rtdm_lockctx_t context; struct tcp_socket *ts; u32 saddr, daddr; u16 sport = 0, dport = 0; /* set to 0 to silence compiler */ char sbuffer[24]; char dbuffer[24]; int state; int index; xnvfile_printf(it, "Hash Local Address " "Foreign Address State\n"); for (index = 0; index < RT_TCP_SOCKETS; index++) { rtdm_lock_get_irqsave(&tcp_socket_base_lock, context); ts = port_registry[index]; state = ts ?
ts->tcp_state : TCP_CLOSE; if (ts && ts->tcp_state != TCP_CLOSE) { saddr = ts->saddr; sport = ts->sport; daddr = ts->daddr; dport = ts->dport; } rtdm_lock_put_irqrestore(&tcp_socket_base_lock, context); if (state != TCP_CLOSE) { snprintf(sbuffer, sizeof(sbuffer), "%u.%u.%u.%u:%u", NIPQUAD(saddr), ntohs(sport)); snprintf(dbuffer, sizeof(dbuffer), "%u.%u.%u.%u:%u", NIPQUAD(daddr), ntohs(dport)); xnvfile_printf(it, "%04X %-23s %-23s %s\n", sport & port_hash_mask, sbuffer, dbuffer, rt_tcp_string_of_state(state)); } } return 0; } static struct xnvfile_regular_ops rtnet_ipv4_tcp_vfile_ops = { .show = rtnet_ipv4_tcp_show, }; static struct xnvfile_regular rtnet_ipv4_tcp_vfile = { .ops = &rtnet_ipv4_tcp_vfile_ops, }; /*** * rt_tcp_proc_register */ static int __init rt_tcp_proc_register(void) { return xnvfile_init_regular("tcp", &rtnet_ipv4_tcp_vfile, &ipv4_proc_root); } /*** * rt_tcp_proc_unregister */ static void rt_tcp_proc_unregister(void) { xnvfile_destroy_regular(&rtnet_ipv4_tcp_vfile); } #endif /* CONFIG_XENO_OPT_VFILE */ /*** * rt_tcp_init */ int __init rt_tcp_init(void) { unsigned int skbs; int i; int ret; if ((tcp_auto_port_start < 0) || (tcp_auto_port_start >= 0x10000 - RT_TCP_SOCKETS)) tcp_auto_port_start = 1024; tcp_auto_port_start = htons(tcp_auto_port_start & (tcp_auto_port_mask & 0xFFFF)); tcp_auto_port_mask = htons(tcp_auto_port_mask | 0xFFFF0000); for (i = 0; i < ARRAY_SIZE(port_hash); i++) INIT_HLIST_HEAD(&port_hash[i]); /* Perform essential initialization of the RST|ACK socket */ skbs = rt_bare_socket_init(rst_fd, IPPROTO_TCP, RT_TCP_RST_PRIO, RT_TCP_RST_POOL_SIZE); if (skbs < RT_TCP_RST_POOL_SIZE) printk("rttcp: allocated only %u RST|ACK rtskbs\n", skbs); rst_socket.sock.prot.inet.tos = 0; rst_fd->refs = 1; rtdm_lock_init(&rst_socket.socket_lock); /* * 100 ms forwarding timer with 8.38 ms slots */ ret = timerwheel_init(100000000ull, 23); if (ret < 0) { rtdm_printk("rttcp: can't initialize timerwheel task: %d\n", -ret); goto out_1; } #ifdef CONFIG_XENO_OPT_VFILE if ((ret = rt_tcp_proc_register()) < 0) { rtdm_printk("rttcp: can't initialize proc entry: %d\n", -ret); goto out_2; } #endif /* CONFIG_XENO_OPT_VFILE */ rt_inet_add_protocol(&tcp_protocol); ret = rtdm_dev_register(&tcp_device); if (ret < 0) { rtdm_printk("rttcp: can't register RT TCP: %d\n", -ret); goto out_3; } return ret; out_3: rt_inet_del_protocol(&tcp_protocol); #ifdef CONFIG_XENO_OPT_VFILE rt_tcp_proc_unregister(); #endif /* CONFIG_XENO_OPT_VFILE */ out_2: timerwheel_cleanup(); out_1: rt_bare_socket_cleanup(&rst_socket.sock); return ret; } /*** * rt_tcp_release */ void __exit rt_tcp_release(void) { rt_inet_del_protocol(&tcp_protocol); #ifdef CONFIG_XENO_OPT_VFILE rt_tcp_proc_unregister(); #endif /* CONFIG_XENO_OPT_VFILE */ timerwheel_cleanup(); rt_bare_socket_cleanup(&rst_socket.sock); rtdm_dev_unregister(&tcp_device); } module_init(rt_tcp_init); module_exit(rt_tcp_release);
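/*
 * Usage sketch (user space, hypothetical peer address and port): the driver
 * registers a protocol device for PF_INET/SOCK_STREAM, so an application
 * reaches it through the ordinary BSD socket calls, which map onto the
 * handlers above (_RTIOC_CONNECT -> rt_tcp_connect(), read() ->
 * rt_tcp_read(), write() -> rt_tcp_write(), close() -> rt_tcp_close()):
 *
 *   int fd = socket(AF_INET, SOCK_STREAM, 0);
 *   struct sockaddr_in peer;
 *   memset(&peer, 0, sizeof(peer));
 *   peer.sin_family = AF_INET;
 *   peer.sin_port = htons(7000);                  // assumed server port
 *   peer.sin_addr.s_addr = inet_addr("192.168.0.2");  // assumed address
 *   connect(fd, (struct sockaddr *)&peer, sizeof(peer));
 *   write(fd, "ping", 4);   // blocks up to SO_SNDTIMEO / sk_sndtimeo
 *   char reply[4];
 *   read(fd, reply, sizeof(reply));
 *   close(fd);   // FIN handshake, bounded by the close_timeout parameter
 */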