/*
  Copyright Mission Critical Linux, 2000

  Kimberlite is free software; you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
  Free Software Foundation; either version 2, or (at your option) any
  later version.

  Kimberlite is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with Kimberlite; see the file COPYING.  If not, write to the
  Free Software Foundation, Inc.,  675 Mass Ave, Cambridge, 
  MA 02139, USA.
*/
/*
 *  $Id: diskrawio.c,v 1.9 2000/09/07 23:25:49 winchell Exp $
 *
 *  Copyright (C) 2000 Mission Critical Linux, LLC
 *
 *  author: Tim Burke <burke@missioncriticallinux.com>
 *  description: Raw IO Interfaces.
 *
 * The RAW IO code we are using from 2.2.13 requires user buffers and
 * disk offsets to be 512 byte aligned.  So this code consists of a
 * read routine and a write routine, each of which checks whether the
 * user buffer is aligned.  If it isn't, a temporary aligned buffer is
 * allocated and the data is copied through it as part of the IO
 * operation itself.
 */
#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <sys/file.h>   
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
#include "diskstate.h"
#include "disk_proto.h"
#include <logger.h>
#include <sys/syslog.h>

/* RCS identification string for this file; tagged unused so it need not be referenced. */
static const char *version __attribute__ ((unused)) = "$Id: diskrawio.c,v 1.9 2000/09/07 23:25:49 winchell Exp $";

/* Descriptor for /dev/zero, opened by initAlignedBufStuff(); -1 while not open. */
static int zfd = -1;
/*
 * Length used for every bounce buffer mmap/munmap and the largest IO the
 * bounce path accepts.  Starts as MAX_BOUNCEIO_LENGTH and is replaced by
 * sysconf(_SC_PAGESIZE) in initAlignedBufStuff().
 */
static int pageSize = MAX_BOUNCEIO_LENGTH;
/* IO counters, bumped in diskRawRead()/diskRawWrite() and reported by printRawIOStats(). */
ulong bounceioReads = 0;
ulong bounceioWrites = 0;
ulong alignedReads = 0;
ulong alignedWrites = 0;
/* Defined elsewhere.  The shadow read/write routines call initSharedFD() while this flag is zero. */
extern int sharedPartitionFDinited;
/* Defined elsewhere.  Indexed by partition number to obtain the descriptor used for lseek/read/write. */
extern int sharedPartitionFD[];
/* Partition diskRawReadShadow() tries first; -1 means choose one at random. */
int preferredReadPartition = -1;

/*
 * Forward routine declarations.
 */
int initSharedFD(void);
int diskRawRead(int fd, char *buf, int len);
int diskRawWrite(int fd, char *buf, int len);
int clu_write_checksum(char *alignedBuf, int len, ulong chksum_off);
int clu_check_checksum(char *alignedBuf, int len, ulong chksum_off);

/*
 * This initialization routine must be called prior to any of the
 * others to set things up.
 */
int initAlignedBufStuff(void) {
	if (zfd != -1) {
	    return(0);
	}
        pageSize = sysconf(_SC_PAGESIZE);

        zfd = open("/dev/zero", O_RDWR);
        if (zfd == -1) {
	    clulog(LOG_ERR, "initAlignedBufStuff: unable to open /dev/zero.\n");
            return(-1);
        }
	return(0);
}

/*
 * Close the /dev/zero descriptor opened by initAlignedBufStuff() and mark
 * the module as uninitialized so a later init call reopens it.
 *
 * Returns 0 if there was nothing to close, otherwise the result of close().
 */
int deinitAlignedBufStuff(void) {
	int ret;

	if (zfd == -1) {
	    /* Never initialized, or already torn down: avoid close(-1). */
	    return(0);
	}
	ret = close(zfd);
	zfd = -1;
	return(ret);
}

/*
 * Allocate one bounce buffer of pageSize bytes by privately mapping
 * /dev/zero.  Initializes the module first if that has not been done.
 *
 * Returns the mapping on success, or MAP_FAILED if initialization or the
 * mmap itself fails.  Release the buffer with freeAlignedBuf().
 */
char * allocAlignedBuf(void) {
    char *alignedBuffer;

    if (initAlignedBufStuff()) {
        return MAP_FAILED;
    }

    alignedBuffer = mmap(0, pageSize, PROT_READ|PROT_WRITE, MAP_PRIVATE, zfd, 0);
    if (alignedBuffer == MAP_FAILED) {
        clulog(LOG_ERR, "allocAlignedBuf: mmap failed.\n");
        return(alignedBuffer);
    }

#ifdef DISKSTATE_DEBUG
    /* Only touch the buffer once we know the mapping succeeded. */
    memset(alignedBuffer, 0, pageSize);
#endif // DISKSTATE_DEBUG

    return(alignedBuffer);
}

/*
 * Unmap a bounce buffer of pageSize bytes (as handed out by
 * allocAlignedBuf()).  Logs on failure and returns munmap()'s result.
 */
int freeAlignedBuf(char *buf) {
    const int rc = munmap(buf, pageSize);

    if (rc >= 0) {
        return(rc);
    }
    clulog(LOG_ERR, "freeAlignedBuf: munmap failed.\n");
    return(rc);
}
/*
 * Select which partition diskRawReadShadow() reads first: 0 or 1 to pin a
 * partition, -1 to go back to a random choice.  Any other value is ignored.
 */
void setPreferredReadPartition(int partition)
{
	switch (partition) {
	case -1:
	case 0:
	case 1:
		preferredReadPartition = partition;
		break;
	default:
		/* Out-of-range request: keep the current preference. */
		break;
	}
}
/*
 * Switch the preferred read partition to the other one.  A preference of
 * 0 becomes 1; every other value (1, the "random" marker -1, or anything
 * unexpected) becomes 0.
 */
void flipPreferredReadPartition(void)
{
	preferredReadPartition = (preferredReadPartition == 0) ? 1 : 0;
}
/*
 * Read len bytes at readOffset from the shadowed quorum partitions into
 * buf, verifying the data with the checksum identified by chksum_off.
 *
 * Since the quorum partition is shadowed there are 2 possible sources.
 * The first partition tried is preferredReadPartition when one is set;
 * otherwise it is picked at random to spread reads across both partitions.
 *
 *  - If the first read cannot be performed at all (SHADOW_NO_ACCESS) the
 *    call fails immediately; the other partition is not consulted.
 *  - If the first read passes its checksum, 0 is returned.
 *  - On a bad checksum the other partition is read.  If that copy is also
 *    bad, the first partition is read one more time before giving up.
 *  - Once a good copy has been read after a bad one, the data is written
 *    back to both partitions with diskRawWriteShadow() as a means of
 *    self-healing, provided the repair is allowed (see below).
 *
 * repair_ok == 0 means it is not safe to make the repair.  This is used by
 * the lock code, which itself calls diskRawReadShadow(); since the lock is
 * read through this routine, attempting a repair there would recurse
 * without end.
 *
 * repair_ok == 1 means the repair is safe only if the cluster lock is
 * held (checked here via test_clu_lock_held()).  For example, the service
 * manager might read the service state in a lazy way (no lock) or in a
 * serious way (lock held).  For the former a repair might corrupt the
 * state for a lock holder; for the latter it is safe, as the only one who
 * could see corrupt data is the lazy reader.
 *
 * Returns 0 on success and -1 on failure.  When a repair is attempted the
 * result of diskRawWriteShadow() is returned, so a failed repair is
 * reported as an error even though buf already holds a good copy.
 */
int diskRawReadShadow(off_t readOffset, char *buf, int len, ulong chksum_off, int repair_ok)
{
	int part, tries;
	int status, ret;

	if ((readOffset < 0) || (len < 0)) {
		/* off_t has no portable conversion specifier; widen it explicitly. */
		clulog(LOG_ERR, "diskRawReadShadow: readOffset=%lld, len=%d.\n",
		       (long long)readOffset, len);
		return(-1);
	}
	if (!sharedPartitionFDinited) {
		if (initSharedFD()) {
			return -1;
		}
	}

	// First decide which of the 2 partitions to start with.
	if (preferredReadPartition != -1) {
		part = preferredReadPartition;
	} else {
		part = random() & 1;
	}

	ret = diskLseekRawReadChecksum(part, readOffset, buf, len, chksum_off);
	if (ret == SHADOW_NO_ACCESS) {
		clulog(LOG_ERR, "diskRawReadShadow: shadow read failed.\n");
		return -1;
	}
	if (ret == SHADOW_SUCCESS) {
		return 0;
	}

	/* Checksum failure.  */

	// A repair is only permitted while the cluster lock is held.
	if (repair_ok && !test_clu_lock_held()) {
		repair_ok = 0;
	}

	tries = 0;

	part = part ^ 1;
  top:
	// Read from the other partition
	status = diskLseekRawReadChecksum(part, readOffset, buf, len, chksum_off);
	if (status == SHADOW_NO_ACCESS) {
		clulog(LOG_EMERG, "diskRawReadShadow: no acces to quorum device.\n");
		goto fail;
	}
	if (status == SHADOW_SUCCESS && !repair_ok) {
		// buf holds a good copy; leave the damaged partition alone.
		clulog(LOG_EMERG, "diskRawReadShadow: skipping repair.\n");
		return 0;
	}

	if (status != SHADOW_SUCCESS) {
		tries++;
		if (tries > 1) {
			clulog(LOG_EMERG, "diskRawReadShadow: checksums bad on both partitions\n");
			goto fail;
		}
		// Give the partition we started with one more chance.
		part = part ^ 1;
		goto top;
	}

	// Repair damaged partition.
	ret = diskRawWriteShadow(readOffset, buf, len, chksum_off);
	if (ret) {
		clulog(LOG_EMERG, "diskRawReadShadow: failed repair offset %lld, length %d.\n",
		       (long long)readOffset, len);
	}

	return ret;

  fail:
	return -1;
}
/*
 * Seek to readOffset on the given shared partition, read len bytes into
 * buf with diskRawRead(), and verify the result with clu_check_checksum().
 *
 * partition is used directly as an index into sharedPartitionFD[]; it is
 * not range-checked here.
 *
 * Returns:
 *   SHADOW_SUCCESS       data read and checksum verified
 *   SHADOW_NO_ACCESS     the seek failed or the read came up short
 *   SHADOW_BAD_CHECKSUM  data was read but failed the checksum
 */
int diskLseekRawReadChecksum(int partition, off_t readOffset, char *buf, int len, ulong chksum_off)
{
	off_t seekret;
	int ret;

	/*
	 * Keep the lseek() result in an off_t: squeezing it into an int
	 * would make a successful seek beyond 2GB look like a failure.
	 */
	seekret = lseek(sharedPartitionFD[partition], readOffset, SEEK_SET);
	if (seekret != readOffset) {
		clulog(LOG_ERR,
		       "diskLseekRawReadChecksum: can't seek to offset %lld.\n",
		       (long long)readOffset);
		return(SHADOW_NO_ACCESS);
	}
	ret = diskRawRead(sharedPartitionFD[partition], buf, len);
	if (ret != len) {
		clulog(LOG_ERR,
		       "diskLseekRawReadChecksum: aligned read returned %d, not %d.\n",
		       ret, len);
		return(SHADOW_NO_ACCESS);
	}
	if (clu_check_checksum(buf, len, chksum_off)) {
		clulog(LOG_EMERG,"diskLseekRawReadChecksum: bad check sum, part = %d offset = %lld len = %d\n",
		       partition, (long long)readOffset, len);
		return SHADOW_BAD_CHECKSUM;
	}
	return SHADOW_SUCCESS;
}


/*
 * The RAW IO implementation requires buffers to be 512 byte aligned.
 * Here we check for alignment and do a bounceio if necessary.
 *
 * Reads len bytes from fd (at its current file offset) into buf.
 * If buf is suitably aligned and len is a multiple of 512 the read goes
 * straight into buf.  Otherwise the read is done into a temporary aligned
 * buffer, with the length rounded up to the next multiple of 512, and the
 * first len bytes are copied out to buf.
 *
 * The bounce path handles at most pageSize bytes per call.
 *
 * Returns the number of bytes delivered into buf (never more than len),
 * or -1 on error: module not initialized, negative len, request too large
 * for the bounce buffer, buffer allocation failure, or read() failure.
 */
int diskRawRead(int fd, char *buf, int len) {
    char *alignedBuf;
    int readret;
    int readlen;

    if (zfd < 0) {
	clulog(LOG_ERR, "diskRawRead: initAlignedBufStuff not called!\n");
	return(-1);
    }
    if (len < 0) {
	/* A negative length would reach the copy below as a huge size_t. */
	clulog(LOG_ERR, "diskRawRead: invalid length %d.\n", len);
	return(-1);
    }

    /*
     * NOTE(review): the 0x3ff mask tests for 1024 byte alignment, which is
     * stricter than the 512 bytes described above; buffers that are only
     * 512 byte aligned take the bounce path.  Confirm whether intended.
     */
    if ((((unsigned long)buf & (unsigned long)0x3ff) == 0) &&
        ((len % 512) == 0)) {
	// Already aligned and even multiple of 512, no bounceio required.
	alignedReads++;
	return(read(fd, buf, len));
    }

    if (len > pageSize) {
	clulog(LOG_ERR, "diskRawRead: not setup for reads larger than %d.\n",
		pageSize);
	return(-1);
    }

    /*
     * All IOs must be of size which is a multiple of 512.  Here we
     * just add in enough extra to accommodate.
     * XXX - if the on-disk offsets don't provide enough room we're cooked!
     */
    readlen = len;
    if (len % 512) {
	readlen += 512 - (len % 512);
    }

    bounceioReads++;
    alignedBuf = allocAlignedBuf();
    if (alignedBuf == MAP_FAILED) {
	return(-1);
    }

    readret = read(fd, alignedBuf, readlen);
    if (readret > 0) {
	/* Hand back only what the caller asked for, not the padding. */
	if (readret > len) {
	    readret = len;
	}
	memcpy(buf, alignedBuf, readret);
    }
    freeAlignedBuf(alignedBuf);

    if (readret != len) {
	clulog(LOG_ERR, "diskRawRead: read err, len=%d, readret=%d\n",
		len,readret);
    }
    return(readret);
}

/*
 * Write len bytes from buf at writeOffset to both shared partitions
 * (indices 0 and 1 of sharedPartitionFD[]).
 *
 * clu_write_checksum() is applied to buf before anything is written
 * (presumably stamping the checksum into the buffer at chksum_off -
 * confirm against its definition), so the caller's buffer may be modified.
 *
 * The partitions are written in order and the first failure aborts the
 * call, so on error partition 0 may already have been updated while
 * partition 1 has not.
 *
 * Returns 0 if both writes succeed, -1 otherwise.
 */
int diskRawWriteShadow(off_t writeOffset, char *buf, int len, ulong chksum_off)
{
	off_t retval_seek;
	int retval_write;
	int i;

	if ((writeOffset < 0) || (len < 0)) {
		/* Specifiers must match the argument types: off_t is widened, len is an int. */
		clulog(LOG_ERR, "diskRawWriteShadow: writeOffset=%lld, chksum_off=%lu, len=%d.\n",
		       (long long)writeOffset, chksum_off, len);
		return(-1);
	}
	if (!sharedPartitionFDinited) {
		if (initSharedFD()) {
			return -1;
		}
	}

	if (clu_write_checksum(buf, len, chksum_off)) {
		clulog(LOG_ERR, "diskRawWriteShadow: unable to write check sum.\n");
		return(-1);
	}
	for (i = 0; i < 2; i++) {
		retval_seek = lseek(sharedPartitionFD[i], writeOffset, SEEK_SET);
		if (retval_seek != writeOffset) {
			clulog(LOG_ERR, "diskRawWriteShadow: can't seek to offset %lld\n",
			       (long long)writeOffset);
			return(-1);
		}
		/* diskRawWrite() returns int, so keep the result as one for %d. */
		retval_write = diskRawWrite(sharedPartitionFD[i], buf, len);
		if (retval_write != len) {
			clulog(LOG_ERR,
			       "diskRawWriteShadow: aligned write returned %d, not %d.\n",
			       retval_write, len);
			return(-1);
		}
	}
	return 0;
}
/* Same as diskRawWriteShadow, except to named partition.  Only used by
 * config database, which handles writes to shadow.  Config database is treated
 * differently because its size is greater than one block and hence, not atomic.
 *
 * partition is used directly as an index into sharedPartitionFD[]; it is
 * not range-checked here.  clu_write_checksum() is applied to buf before
 * the write, so the caller's buffer may be modified.
 *
 * Returns 0 on success, -1 on bad arguments, checksum, seek or write failure.
 */
int diskRawWritePartition(int partition, off_t writeOffset, char *buf, int len, ulong chksum_off)
{
	off_t retval_seek;
	int retval_write;

	if ((writeOffset < 0) || (len < 0)) {
		/* Specifiers must match the argument types: off_t is widened, len is an int. */
		clulog(LOG_ERR, "diskRawWritePartition: writeOffset=%lld, chksum_off=%lu, len=%d.\n",
		       (long long)writeOffset, chksum_off, len);
		return(-1);
	}
	if (!sharedPartitionFDinited) {
		if (initSharedFD()) {
			return -1;
		}
	}

	if (clu_write_checksum(buf, len, chksum_off)) {
		clulog(LOG_ERR, "diskRawWritePartition: unable to write check sum.\n");
		return(-1);
	}
	retval_seek = lseek(sharedPartitionFD[partition], writeOffset, SEEK_SET);
	if (retval_seek != writeOffset) {
		clulog(LOG_ERR, "diskRawWritePartition: can't seek to offset %lld\n",
		       (long long)writeOffset);
		return(-1);
	}
	/* diskRawWrite() returns int, so keep the result as one for %d. */
	retval_write = diskRawWrite(sharedPartitionFD[partition], buf, len);
	if (retval_write != len) {
		clulog(LOG_ERR,
		       "diskRawWritePartition: aligned write returned %d, not %d.\n",
		       retval_write, len);
		return(-1);
	}

	return 0;
}

/*
 * The RAW IO implementation requires buffers to be 512 byte aligned.
 * Here we check for alignment and do a bounceio if necessary.
 *
 * Writes len bytes from buf to fd (at its current file offset).
 * If buf is suitably aligned and len is a multiple of 512 the write is
 * issued directly from buf.  Otherwise the data is copied into a
 * temporary aligned buffer and written from there, with the length
 * rounded up to the next multiple of 512; the padding bytes after the
 * caller's data therefore land on disk as well.
 *
 * The bounce path handles at most pageSize bytes per call.
 *
 * Returns the number of caller bytes written (never more than len), or
 * -1 on error: module not initialized, negative len, request too large
 * for the bounce buffer, buffer allocation failure, or write() failure.
 */

int diskRawWrite(int fd, char *buf, int len) {
    char *alignedBuf;
    int ret;
    int writelen;

    if (zfd < 0) {
	clulog(LOG_ERR, "diskRawWrite: initAlignedBufStuff not called!\n");
	return(-1);
    }
    if (len < 0) {
	/* A negative length would reach the copy below as a huge size_t. */
	clulog(LOG_ERR, "diskRawWrite: invalid length %d.\n", len);
	return(-1);
    }

    /*
     * NOTE(review): the 0x3ff mask tests for 1024 byte alignment, which is
     * stricter than the 512 bytes described above; buffers that are only
     * 512 byte aligned take the bounce path.  Confirm whether intended.
     */
    if ((((unsigned long)buf & (unsigned long)0x3ff) == 0) &&
        ((len % 512) == 0)) {
	// Already aligned and even multiple of 512, no bounceio required.
	alignedWrites++;
	return(write(fd, buf, len));
    }

    if (len > pageSize) {
	clulog(LOG_ERR, "diskRawWrite: not setup for larger than %d.\n",
		pageSize);
	return(-1);
    }

    /*
     * All IOs must be of size which is a multiple of 512.  Here we
     * just add in enough extra to accommodate.
     * XXX - if the on-disk offsets don't provide enough room we're cooked!
     */
    writelen = len;
    if (len % 512) {
	writelen += 512 - (len % 512);
    }

    bounceioWrites++;
    alignedBuf = allocAlignedBuf();
    if (alignedBuf == MAP_FAILED) {
	return(-1);
    }

    memcpy(alignedBuf, buf, len);
    ret = write(fd, alignedBuf, writelen);
    if (ret > len) {
	/* Report only the caller's bytes, not the padding. */
	ret = len;
    }
    freeAlignedBuf(alignedBuf);

    if (ret != len) {
	clulog(LOG_ERR, "diskRawWrite: write err, len=%d, ret=%d\n",
		len,ret);
    }
    return(ret);
}

/*
 * Log the aligned and bounce IO counters at LOG_DEBUG level.
 * The counters are unsigned long, so they are printed with %lu.
 */
void printRawIOStats(void) {
    clulog(LOG_DEBUG, "\nRaw IO Statistics\n");
    clulog(LOG_DEBUG, "Aligned IOs: reads=%lu, writes=%lu\n",alignedReads, alignedWrites);
    clulog(LOG_DEBUG, "Bounce IOs: reads=%lu, writes=%lu\n",bounceioReads, bounceioWrites);
}

